diff --git a/__pycache__/_config.cpython-37.pyc b/__pycache__/_config.cpython-37.pyc deleted file mode 100644 index 8f84fb0..0000000 Binary files a/__pycache__/_config.cpython-37.pyc and /dev/null differ diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc deleted file mode 100644 index 420c21a..0000000 Binary files a/__pycache__/barlow.cpython-37.pyc and /dev/null differ diff --git a/__pycache__/barlow.cpython-38.pyc b/__pycache__/barlow.cpython-38.pyc deleted file mode 100644 index f9d719e..0000000 Binary files a/__pycache__/barlow.cpython-38.pyc and /dev/null differ diff --git a/__pycache__/barlow.cpython-39.pyc b/__pycache__/barlow.cpython-39.pyc deleted file mode 100644 index 9d69311..0000000 Binary files a/__pycache__/barlow.cpython-39.pyc and /dev/null differ diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc deleted file mode 100644 index b13b62f..0000000 Binary files a/__pycache__/barlow_utils.cpython-37.pyc and /dev/null differ diff --git a/__pycache__/barlow_utils.cpython-38.pyc b/__pycache__/barlow_utils.cpython-38.pyc deleted file mode 100644 index 89d8ded..0000000 Binary files a/__pycache__/barlow_utils.cpython-38.pyc and /dev/null differ diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc deleted file mode 100644 index acc1737..0000000 Binary files a/__pycache__/models.cpython-37.pyc and /dev/null differ diff --git a/__pycache__/models.cpython-38.pyc b/__pycache__/models.cpython-38.pyc deleted file mode 100644 index 13fe121..0000000 Binary files a/__pycache__/models.cpython-38.pyc and /dev/null differ diff --git a/__pycache__/models.cpython-39.pyc b/__pycache__/models.cpython-39.pyc deleted file mode 100644 index ca374b0..0000000 Binary files a/__pycache__/models.cpython-39.pyc and /dev/null differ diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc deleted file mode 100644 index c4b566b..0000000 Binary files a/__pycache__/t_dataset.cpython-37.pyc and /dev/null differ diff --git a/__pycache__/t_dataset.cpython-38.pyc b/__pycache__/t_dataset.cpython-38.pyc deleted file mode 100644 index 1814434..0000000 Binary files a/__pycache__/t_dataset.cpython-38.pyc and /dev/null differ diff --git a/__pycache__/t_dataset.cpython-39.pyc b/__pycache__/t_dataset.cpython-39.pyc deleted file mode 100644 index 245625a..0000000 Binary files a/__pycache__/t_dataset.cpython-39.pyc and /dev/null differ diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc deleted file mode 100644 index b5b1fb5..0000000 Binary files a/__pycache__/train_translation.cpython-37.pyc and /dev/null differ diff --git a/__pycache__/train_translation.cpython-38.pyc b/__pycache__/train_translation.cpython-38.pyc deleted file mode 100644 index 413cf61..0000000 Binary files a/__pycache__/train_translation.cpython-38.pyc and /dev/null differ diff --git a/__pycache__/train_translation.cpython-39.pyc b/__pycache__/train_translation.cpython-39.pyc deleted file mode 100644 index ae42fee..0000000 Binary files a/__pycache__/train_translation.cpython-39.pyc and /dev/null differ diff --git a/__pycache__/translation_dataset.cpython-37.pyc b/__pycache__/translation_dataset.cpython-37.pyc deleted file mode 100644 index 7ac9cd8..0000000 Binary files a/__pycache__/translation_dataset.cpython-37.pyc and /dev/null differ diff --git a/__pycache__/translation_dataset.cpython-38.pyc b/__pycache__/translation_dataset.cpython-38.pyc deleted file mode 100644 index 849c726..0000000 Binary files a/__pycache__/translation_dataset.cpython-38.pyc and /dev/null differ diff --git a/__pycache__/translation_dataset.cpython-39.pyc b/__pycache__/translation_dataset.cpython-39.pyc deleted file mode 100644 index 5c8b8c5..0000000 Binary files a/__pycache__/translation_dataset.cpython-39.pyc and /dev/null differ diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc deleted file mode 100644 index 12c22a5..0000000 Binary files a/__pycache__/translation_utils.cpython-37.pyc and /dev/null differ diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc deleted file mode 100644 index a1e7877..0000000 Binary files a/__pycache__/translation_utils.cpython-38.pyc and /dev/null differ diff --git a/__pycache__/translation_utils.cpython-39.pyc b/__pycache__/translation_utils.cpython-39.pyc deleted file mode 100644 index c4cfb7d..0000000 Binary files a/__pycache__/translation_utils.cpython-39.pyc and /dev/null differ diff --git a/t_dataset.py b/t_dataset.py index 5767e4c..8b4334e 100644 --- a/t_dataset.py +++ b/t_dataset.py @@ -1,3 +1,4 @@ +# edits: padding=True import torch from datasets import load_dataset from transformers import AutoTokenizer @@ -20,40 +21,67 @@ def __init__(self, split = "train" else: split = "test" + print('getting dataset') self.dataset = load_dataset('wmt14', "de-en", split=split) self.de_list = [] self.en_list = [] # self.tokenizer = tokenizer self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') en_list_2 = [] + #for k in range(100):#len(self.dataset)): + # n,i = self.dataset[k] for n, i in enumerate(self.dataset): en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) + #print(n) + if n==500: + break + #print(len(en_list_2)) + # print(max(en_list_2)) + token_res = self.tokenizer(en_list_2, padding=True,max_length=512, return_tensors='pt', truncation=True)['input_ids'] + a1 = list(token_res) self.en_vocab, self.en_vocab_size = vocab(a1) self.bert2id_dict = translation_utils.bert2id(self.en_vocab) self.id2bert_dict = translation_utils.id2bert(self.en_vocab) + + for n, i in enumerate(self.dataset): + #if len(i['translation']['de'])> 400: + # print(len(i['translation']['de'])) + + #elif len(i['translation']['en'])> 400: + # print(len(i['translation']['en'])) + # print(i['translation']['en']) + + #else: + # print(len(i['translation']['de'])) + self.de_list.append(self.tokenizer(i['translation']['de'].lower(), padding=True, return_tensors='pt',max_length=512, truncation=True)["input_ids"]) + self.en_list.append(self.tokenizer(i['translation']['en'].lower(), padding=True, return_tensors='pt',max_length=512, truncation=True)["input_ids"]) + if n==500: + break + for i in self.dataset: self.de_list.append(self.tokenizer(i['translation']['de'].lower(), padding=True, return_tensors='pt')["input_ids"]) - self.en_list.append(self.tokenizer(i['translation']['en'].lower(), + self.en_list.append(self.tokenizer(i['translation']['en'].lower(), padding=True, return_tensors='pt')["input_ids"]) - # en_list_id = [] # for i in self.dataset: # en_list_id.append(i['translation']['en'].lower()) de_list_1 = [] for n,i in enumerate(self.dataset): de_list_1.append(i['translation']['de'].lower()) + if n==500: + break - a = list(self.tokenizer(de_list_1, padding=True, return_tensors='pt')['input_ids']) + a = list(self.tokenizer(de_list_1, padding=True, return_tensors='pt',max_length=512, truncation=True)['input_ids']) en_list_1 = [] for n,i in enumerate(self.dataset): en_list_1.append(i['translation']['en'].lower()) + if n==500: + break - b = list(self.tokenizer(de_list_1, padding=True, return_tensors='pt')['input_ids']) + b = list(self.tokenizer(de_list_1, padding=True, max_length=512, return_tensors='pt', truncation=True)['input_ids']) # en_vocab, self.en_vocab_size = vocab(b) self.de_vocab, self.de_vocab_size = vocab(a) diff --git a/t_dataset2.py b/t_dataset2.py new file mode 100644 index 0000000..b7cb015 --- /dev/null +++ b/t_dataset2.py @@ -0,0 +1,157 @@ + +import torch +from datasets import load_dataset +from transformers import AutoTokenizer +# from _config import Config as config +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import DataLoader, Dataset + +import translation_utils +from translation_utils import vocab +import os + + +os.environ['TRANSFORMERS_OFFLINE'] = 'yes' +class Translation_dataset_t(Dataset): + + def __init__(self, + train: bool = True): + + if train: + split = "train" + else: + split = "test" + print('getting dataset') + self.dataset = load_dataset('wmt14', "de-en", split=split) + self.de_list = [] + self.en_list = [] +# self.tokenizer = tokenizer + self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') + en_list_2 = [] + #for k in range(100):#len(self.dataset)): + # n,i = self.dataset[k] + for n, i in enumerate(self.dataset): + en_list_2.append(i['translation']['en'].lower()) + #print(n) + if n==500: + break + print(len(en_list_2)) + # print(max(en_list_2)) + print('error not found') + token_res = self.tokenizer(en_list_2, padding='max_length',max_length=512, return_tensors='pt', truncation=True)['input_ids'] + a1 = list(token_res) + print('error') + self.en_vocab, self.en_vocab_size = vocab(a1) + self.bert2id_dict = translation_utils.bert2id(self.en_vocab) + self.id2bert_dict = translation_utils.id2bert(self.en_vocab) + print('e') + + + for n, i in enumerate(self.dataset): + #if len(i['translation']['de'])> 400: + # print(len(i['translation']['de'])) + + #elif len(i['translation']['en'])> 400: + # print(len(i['translation']['en'])) + # print(i['translation']['en']) + + #else: + # print(len(i['translation']['de'])) + if len(i['translation']['de'].lower()) > 500: + pass + elif len(i['translation']['en'].lower())>500: + pass + + self.de_list.append(self.tokenizer(i['translation']['de'].lower(), padding='max_length', return_tensors='pt',max_length=512, truncation=True)["input_ids"]) + self.en_list.append(self.tokenizer(i['translation']['en'].lower(), padding='max_length', return_tensors='pt',max_length=512, truncation=True)["input_ids"]) + # if n==500: + # break + ''' + for i in self.dataset: + self.de_list.append(self.tokenizer(i['translation']['de'].lower(), + padding=True, return_tensors='pt')["input_ids"]) + self.en_list.append(self.tokenizer(i['translation']['en'].lower(), + padding=True, return_tensors='pt')["input_ids"]) + ''' + # en_list_id = [] + # for i in self.dataset: + # en_list_id.append(i['translation']['en'].lower()) + + de_list_1 = [] + for n,i in enumerate(self.dataset): + + if len(i['translation']['de'].lower()) > 500: + pass + elif len(i['translation']['en'].lower())>500: + pass + de_list_1.append(i['translation']['de'].lower()) + #if n==500: + #break + + a = list(self.tokenizer(de_list_1, padding='max_length', return_tensors='pt',max_length=512, truncation=True)['input_ids']) + + en_list_1 = [] + for n,i in enumerate(self.dataset): + en_list_1.append(i['translation']['en'].lower()) + if n==500: + break + + b = list(self.tokenizer(de_list_1, padding='max_length', max_length=512, return_tensors='pt', truncation=True)['input_ids']) + # en_vocab, self.en_vocab_size = vocab(b) + self.de_vocab, self.de_vocab_size = vocab(a) + + + #should return the length of the dataset + def __len__(self): + return len(self.de_list) + + #should return a particular example + def __getitem__(self, index): + src = self.de_list[index] + trg = self.en_list[index] + + return {'src':src, 'trg':trg} + + + +class MyCollate: + def __init__(self, + tokenizer, + bert2id_dict: dict): + self.tokenizer = tokenizer + self.pad_idx = self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token) + self.bert2id_dict = bert2id_dict + + def __call__(self, batch): + + source = [] + for i in batch: + source.append(i['src'].T) + #print(source[0].shape, source[1].shape) + source = pad_sequence(source, batch_first=False, padding_value=self.pad_idx) + + target = [] + for i in batch: + target.append(i['trg'].T) + target = pad_sequence(target, batch_first=False, padding_value = self.pad_idx) + + target_inp = target.squeeze(-1)[:-1, :] + target_out = torch.zeros(target.shape) + + for i in range(len(target)): + for j in range(len(target[i])): + try: + target_out[i][j] = self.bert2id_dict[target[i][j].item()] + except KeyError: + target_out[i][j] = self.tokenizer.unk_token_id + + target_out = target_out.squeeze(-1)[1:, :] + + return source.squeeze(), target.squeeze().long(), target_inp.squeeze().long(), target_out.squeeze().long() + + +# dataset = Translation_dataset() +# loader = DataLoader(dataset=dataset, +# batch_size= 32, +# shuffle=False, +# collate_fn=MyCollate()) diff --git a/train_translation.py b/train_translation.py index 64cda2f..9f5b778 100644 --- a/train_translation.py +++ b/train_translation.py @@ -17,6 +17,7 @@ import t_dataset from t_dataset import Translation_dataset_t from t_dataset import MyCollate +import translation_dataset import translation_utils from translation_utils import TokenEmbedding, PositionalEncoding from translation_utils import create_mask @@ -149,10 +150,11 @@ def main_worker(gpu, args): world_size=args.world_size, rank=args.rank) if args.rank == 0: - + ''' wandb.init(config=args, project='translation_test')############################################# wandb.config.update(args) config = wandb.config + ''' # exit() args.checkpoint_dir.mkdir(parents=True, exist_ok=True) @@ -163,7 +165,11 @@ def main_worker(gpu, args): torch.cuda.set_device(gpu) torch.backends.cudnn.benchmark = True +# print('loading barlow dataset') +# dataset = translation_dataset.Translation_dataset() + print('loading translation dataset') dataset = Translation_dataset_t(train=args.train) + print('dataset loaded') src_vocab_size = dataset.de_vocab_size trg_vocab_size = dataset.en_vocab_size tokenizer = dataset.tokenizer @@ -236,10 +242,11 @@ def main_worker(gpu, args): per_device_batch_size = args.batch_size // args.world_size id2bert_dict = dataset.id2bert_dict ############################### + print('instantiating dataloader') loader = torch.utils.data.DataLoader( dataset, batch_size=per_device_batch_size, num_workers=args.workers, pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - + print('loaded on cuda') test_loader = torch.utils.data.DataLoader( dataset, batch_size=1, num_workers=args.workers, pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) @@ -283,7 +290,7 @@ def main_worker(gpu, args): print(json.dumps(stats), file=stats_file) if args.rank == 0: - wandb.log({"epoch_loss":epoch_loss/t}) + #wandb.log({"epoch_loss":epoch_loss/t}) # save checkpoint state = dict(epoch=epoch + 1, model=model.module.state_dict(), optimizer=optimizer.state_dict()) @@ -296,7 +303,7 @@ def main_worker(gpu, args): if epoch%args.checkbleu ==0 : bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) + #wandb.log({'bleu_score': bleu_score}) # print(bleu_score(predicted, target)) ############################################################## # if epoch%1 ==0 : @@ -309,14 +316,14 @@ def main_worker(gpu, args): # optimizer=optimizer.state_dict()) # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') # print('saved translation model in', args.checkpoint_dir) - wandb.finish() + #wandb.finish() else: bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) +# if args.rank == 0: + #wandb.log({'bleu_score': bleu_score}) def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): @@ -366,6 +373,10 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): memory = memory tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) .type(torch.bool)).cuda(gpu, non_blocking=True) + + print('ys shape: ', ys.shape) + print('memory.shape', memory.shape) + print('tgt_mask.shape', tgt_mask.shape) out = model.module.decode(ys, memory, tgt_mask) out = out.transpose(0, 1) prob = model.module.generator(out[:, -1]) @@ -400,4 +411,4 @@ def translate(model: torch.nn.Module, if __name__ == '__main__': main() - wandb.finish() + #wandb.finish() diff --git a/translation_dataset.py b/translation_dataset.py index 274c2f3..9dec23e 100644 --- a/translation_dataset.py +++ b/translation_dataset.py @@ -16,8 +16,16 @@ def __init__(self): self.en_list = [] for i in self.dataset: - self.de_list.append(tokenizer(i['translation']['de'].lower(), padding=True, return_tensors='pt')["input_ids"]) - self.en_list.append(tokenizer(i['translation']['en'].lower(), padding=True, return_tensors='pt')["input_ids"]) + if len(i['translation']['de'])> 400: + #print(len(i['translation']['de'])) + pass + elif len(i['translation']['en'])> 400: + #print(len(i['translation']['en'])) + pass + else: + # print(len(i['translation']['de'])) + self.de_list.append(tokenizer(i['translation']['de'].lower(), padding=True, return_tensors='pt')["input_ids"]) + self.en_list.append(tokenizer(i['translation']['en'].lower(), padding=True, return_tensors='pt')["input_ids"]) diff --git a/translation_utils.py b/translation_utils.py index af3437a..747b03f 100644 --- a/translation_utils.py +++ b/translation_utils.py @@ -88,14 +88,31 @@ def __init__(self, emb_size, mbert): super(TokenEmbedding, self).__init__() # self.embedding = nn.Embedding(vocab_size, emb_size) self.embedding = mbert -# for param in self.embedding.parameters(): -# param.requires_grad = False -# for param in self.embedding.pooler.parameters(): -# param.requires_grad = True + for param in self.embedding.parameters(): + param.requires_grad = False + for param in self.embedding.pooler.parameters(): + param.requires_grad = True self.emb_size = emb_size def forward(self, tokens: torch.tensor): # print(tokens.shape) if len(tokens.shape) ==1: tokens = tokens.unsqueeze(-1) + + try: + self.embedding(tokens.long().T)['last_hidden_state'] + except RuntimeError: + print('errored') + return self.embedding(tokens.long().T)['last_hidden_state'].permute(1, 0, 2) * math.sqrt(self.emb_size) + + # try: + + +''' + except RuntimeError: + print('errored') + b = torch.zeros(tokens.shape[0], 1, 768) + pass + +''' diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log deleted file mode 120000 index 5c95722..0000000 --- a/wandb/debug-internal.log +++ /dev/null @@ -1 +0,0 @@ -run-20220416_014323-1a0lobwa/logs/debug-internal.log \ No newline at end of file diff --git a/wandb/debug.log b/wandb/debug.log deleted file mode 120000 index c54d1ec..0000000 --- a/wandb/debug.log +++ /dev/null @@ -1 +0,0 @@ -run-20220416_014323-1a0lobwa/logs/debug.log \ No newline at end of file diff --git a/wandb/latest-run b/wandb/latest-run deleted file mode 120000 index 34b339f..0000000 --- a/wandb/latest-run +++ /dev/null @@ -1 +0,0 @@ -run-20220416_014323-1a0lobwa \ No newline at end of file diff --git a/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py b/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py deleted file mode 100644 index c6ab0ef..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py +++ /dev/null @@ -1,400 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=5, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=4, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - - # wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) - tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - print(out) - print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - -# for i in len(tgt_tokens): -# tgt_tokens[i] = id2bert[tgt_tokens[i]] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml b/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220415_190620-2py0vpvt/files/config.yaml b/wandb/run-20220415_190620-2py0vpvt/files/config.yaml deleted file mode 100644 index b88038a..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 4 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 5 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 1 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 1 diff --git a/wandb/run-20220415_190620-2py0vpvt/files/diff.patch b/wandb/run-20220415_190620-2py0vpvt/files/diff.patch deleted file mode 100644 index 0634eb7..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/diff.patch +++ /dev/null @@ -1,30635 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..f232b40 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,51 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..c6ab0ef 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,13 +327,15 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -+ tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ print(out) -+ print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) -@@ -375,7 +379,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +387,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+# for i in len(tgt_tokens): -+# tgt_tokens[i] = id2bert[tgt_tokens[i]] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..40790bc 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220415_190620-2py0vpvt/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..6613878 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220415_190620-2py0vpvt/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..1188b40 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220415_190620-2py0vpvt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220415_190620-2py0vpvt/files/output.log b/wandb/run-20220415_190620-2py0vpvt/files/output.log deleted file mode 100644 index ee1cf94..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/output.log +++ /dev/null @@ -1,77 +0,0 @@ - -train_translation.py --load=0 -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py:275: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -translation model saved in checkpoint -Exception in thread Thread-3: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop - msg = self._response_queue.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -wandb: ERROR Internal wandb error: file data was not synced -Exception in thread Thread-15: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status - status_response = self._interface.communicate_stop_status() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status - resp = self._communicate(req, timeout=timeout, local=True) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate - return self._communicate_async(rec, local=local).get(timeout=timeout) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async - raise Exception("The wandb backend process has shutdown") -Exception: The wandb backend process has shutdown -Traceback (most recent call last): - File "", line 1, in - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main - exitcode = _main(fd) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main - return self._bootstrap() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap - threading._shutdown() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown - lock.acquire() -KeyboardInterrupt \ No newline at end of file diff --git a/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt b/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json b/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json deleted file mode 100644 index 7fdc37d..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T13:36:21.737888", - "startedAt": "2022-04-15T13:36:20.741849", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [ - "--load=0" - ], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json b/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json deleted file mode 100644 index 6c757d0..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"epoch_loss": 113.57089103062948, "_runtime": 35, "_timestamp": 1650029815, "_step": 0} \ No newline at end of file diff --git a/wandb/run-20220415_190620-2py0vpvt/logs/debug-internal.log b/wandb/run-20220415_190620-2py0vpvt/logs/debug-internal.log deleted file mode 100644 index 896a0da..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/logs/debug-internal.log +++ /dev/null @@ -1,118 +0,0 @@ -2022-04-15 19:06:20,774 INFO wandb_internal:5906 [internal.py:wandb_internal():91] W&B internal server running at pid: 5906, started at: 2022-04-15 19:06:20.773660 -2022-04-15 19:06:20,798 INFO MainThread:5906 [wandb_init.py:init():423] backend started and connected -2022-04-15 19:06:20,798 DEBUG MainThread:5906 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-15 19:06:20,799 INFO MainThread:5906 [wandb_init.py:init():465] updated telemetry -2022-04-15 19:06:20,809 INFO MainThread:5906 [wandb_init.py:init():484] communicating current version -2022-04-15 19:06:20,822 DEBUG HandlerThread:5906 [handler.py:handle_request():124] handle_request: check_version -2022-04-15 19:06:20,823 DEBUG SenderThread:5906 [sender.py:send():179] send: header -2022-04-15 19:06:20,822 INFO WriterThread:5906 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb -2022-04-15 19:06:20,824 DEBUG SenderThread:5906 [sender.py:send_request():193] send_request: check_version -2022-04-15 19:06:21,045 INFO MainThread:5906 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 19:06:21,045 INFO MainThread:5906 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 19:06:21,046 DEBUG SenderThread:5906 [sender.py:send():179] send: run -2022-04-15 19:06:21,723 INFO MainThread:5906 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 19:06:21,723 DEBUG HandlerThread:5906 [handler.py:handle_request():124] handle_request: run_start -2022-04-15 19:06:21,737 DEBUG HandlerThread:5906 [meta.py:__init__():39] meta init -2022-04-15 19:06:21,737 DEBUG HandlerThread:5906 [meta.py:__init__():53] meta init done -2022-04-15 19:06:21,737 DEBUG HandlerThread:5906 [meta.py:probe():210] probe -2022-04-15 19:06:21,744 DEBUG HandlerThread:5906 [meta.py:_setup_git():200] setup git -2022-04-15 19:06:21,781 INFO SenderThread:5906 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files -2022-04-15 19:06:21,782 INFO SenderThread:5906 [sender.py:_start_run_threads():707] run started: 2py0vpvt with start time 1650029780 -2022-04-15 19:06:21,782 DEBUG SenderThread:5906 [sender.py:send():179] send: summary -2022-04-15 19:06:21,782 INFO SenderThread:5906 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:06:21,821 DEBUG HandlerThread:5906 [meta.py:_setup_git():207] setup git done -2022-04-15 19:06:21,821 DEBUG HandlerThread:5906 [meta.py:_save_code():89] save code -2022-04-15 19:06:21,858 DEBUG HandlerThread:5906 [meta.py:_save_code():110] save code done -2022-04-15 19:06:21,858 DEBUG HandlerThread:5906 [meta.py:_save_patches():127] save patches -2022-04-15 19:06:22,072 DEBUG HandlerThread:5906 [meta.py:_save_patches():169] save patches done -2022-04-15 19:06:22,072 DEBUG HandlerThread:5906 [meta.py:_save_pip():57] save pip -2022-04-15 19:06:22,073 DEBUG HandlerThread:5906 [meta.py:_save_pip():71] save pip done -2022-04-15 19:06:22,073 DEBUG HandlerThread:5906 [meta.py:_save_conda():78] save conda -2022-04-15 19:06:22,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/diff.patch -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/code -2022-04-15 19:06:25,546 DEBUG HandlerThread:5906 [meta.py:_save_conda():86] save conda done -2022-04-15 19:06:25,546 DEBUG HandlerThread:5906 [meta.py:probe():252] probe done -2022-04-15 19:06:25,549 DEBUG SenderThread:5906 [sender.py:send():179] send: files -2022-04-15 19:06:25,549 INFO SenderThread:5906 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 19:06:25,549 INFO SenderThread:5906 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 19:06:25,551 INFO SenderThread:5906 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 19:06:25,581 DEBUG HandlerThread:5906 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:06:25,581 DEBUG SenderThread:5906 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:06:25,593 INFO MainThread:5906 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 19:06:25,594 INFO MainThread:5906 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 19:06:25,594 INFO MainThread:5906 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 19:06:25,633 INFO MainThread:5906 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 19:06:25,633 INFO MainThread:5906 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 19:06:25,634 INFO MainThread:5906 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} -2022-04-15 19:06:25,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml -2022-04-15 19:06:25,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:25,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json -2022-04-15 19:06:26,074 DEBUG SenderThread:5906 [sender.py:send():179] send: config -2022-04-15 19:06:26,807 INFO Thread-14 :5906 [upload_job.py:push():133] Uploaded file /tmp/tmpbd0vash4wandb/373ehk48-wandb-metadata.json -2022-04-15 19:06:26,833 INFO Thread-16 :5906 [upload_job.py:push():133] Uploaded file /tmp/tmpbd0vash4wandb/16e4mjp9-code/train_translation.py -2022-04-15 19:06:27,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/config.yaml -2022-04-15 19:06:27,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:29,014 INFO Thread-18 :5906 [upload_job.py:push():133] Uploaded file /tmp/tmpbd0vash4wandb/2zz8ar1z-diff.patch -2022-04-15 19:06:29,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:33,774 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:41,075 DEBUG HandlerThread:5906 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:06:41,076 DEBUG SenderThread:5906 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:06:47,843 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:49,844 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:50,925 DEBUG SenderThread:5906 [sender.py:send():179] send: stats -2022-04-15 19:06:51,845 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:53,845 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:55,022 DEBUG SenderThread:5906 [sender.py:send():179] send: history -2022-04-15 19:06:55,022 DEBUG SenderThread:5906 [sender.py:send():179] send: summary -2022-04-15 19:06:55,023 INFO SenderThread:5906 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:06:55,851 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:55,851 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json -2022-04-15 19:06:56,825 DEBUG HandlerThread:5906 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:06:56,825 DEBUG SenderThread:5906 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:07:11,877 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:07:12,360 DEBUG HandlerThread:5906 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:07:12,361 DEBUG SenderThread:5906 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:07:19,703 INFO WriterThread:5906 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb -2022-04-15 19:07:19,860 INFO SenderThread:5906 [sender.py:finish():933] shutting down sender -2022-04-15 19:07:19,860 INFO SenderThread:5906 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-15 19:07:19,879 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt requirements.txt -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json wandb-metadata.json -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log output.log -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml conda-environment.yaml -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json wandb-summary.json -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/config.yaml config.yaml -2022-04-15 19:07:19,881 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/diff.patch diff.patch -2022-04-15 19:07:19,881 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py code/train_translation.py -2022-04-15 19:07:19,881 INFO SenderThread:5906 [file_pusher.py:finish():176] shutting down file pusher -2022-04-15 19:07:19,881 INFO SenderThread:5906 [file_pusher.py:join():181] waiting for file pusher -2022-04-15 19:07:21,094 INFO Thread-25 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt -2022-04-15 19:07:21,208 INFO Thread-29 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/config.yaml -2022-04-15 19:07:21,219 INFO Thread-26 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:07:21,814 INFO Thread-27 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml -2022-04-15 19:07:22,524 INFO Thread-28 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json -2022-04-15 19:07:23,194 ERROR wandb_internal:5906 [internal.py:wandb_internal():159] Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -2022-04-15 19:34:32,989 INFO MainThread:5906 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-15 19:34:32,989 INFO MainThread:5906 [wandb_run.py:_restore():1480] restore -2022-04-15 19:34:33,088 INFO MainThread:5906 [wandb_run.py:_restore():1480] restore -2022-04-15 19:34:33,091 INFO MainThread:5906 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220415_190620-2py0vpvt/logs/debug.log b/wandb/run-20220415_190620-2py0vpvt/logs/debug.log deleted file mode 100644 index a71d0fa..0000000 --- a/wandb/run-20220415_190620-2py0vpvt/logs/debug.log +++ /dev/null @@ -1,94 +0,0 @@ -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/logs/debug.log -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/logs/debug-internal.log -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_init.py:init():369] calling init triggers -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} -2022-04-15 19:06:20,743 INFO MainThread:5906 [wandb_init.py:init():418] starting backend -2022-04-15 19:06:20,751 INFO MainThread:5906 [backend.py:ensure_launched():132] starting backend process... -2022-04-15 19:06:20,773 INFO MainThread:5906 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-15 19:06:20,774 INFO wandb_internal:5906 [internal.py:wandb_internal():91] W&B internal server running at pid: 5906, started at: 2022-04-15 19:06:20.773660 -2022-04-15 19:06:20,798 INFO MainThread:5906 [wandb_init.py:init():423] backend started and connected -2022-04-15 19:06:20,799 INFO MainThread:5906 [wandb_init.py:init():465] updated telemetry -2022-04-15 19:06:20,809 INFO MainThread:5906 [wandb_init.py:init():484] communicating current version -2022-04-15 19:06:20,822 INFO WriterThread:5906 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb -2022-04-15 19:06:21,045 INFO MainThread:5906 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 19:06:21,045 INFO MainThread:5906 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 19:06:21,723 INFO MainThread:5906 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 19:06:21,781 INFO SenderThread:5906 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files -2022-04-15 19:06:21,782 INFO SenderThread:5906 [sender.py:_start_run_threads():707] run started: 2py0vpvt with start time 1650029780 -2022-04-15 19:06:21,782 INFO SenderThread:5906 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:06:22,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/diff.patch -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml -2022-04-15 19:06:22,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/code -2022-04-15 19:06:25,549 INFO SenderThread:5906 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 19:06:25,549 INFO SenderThread:5906 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 19:06:25,551 INFO SenderThread:5906 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 19:06:25,593 INFO MainThread:5906 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 19:06:25,594 INFO MainThread:5906 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 19:06:25,594 INFO MainThread:5906 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 19:06:25,633 INFO MainThread:5906 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 19:06:25,633 INFO MainThread:5906 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 19:06:25,634 INFO MainThread:5906 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} -2022-04-15 19:06:25,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml -2022-04-15 19:06:25,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:25,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json -2022-04-15 19:06:26,807 INFO Thread-14 :5906 [upload_job.py:push():133] Uploaded file /tmp/tmpbd0vash4wandb/373ehk48-wandb-metadata.json -2022-04-15 19:06:26,833 INFO Thread-16 :5906 [upload_job.py:push():133] Uploaded file /tmp/tmpbd0vash4wandb/16e4mjp9-code/train_translation.py -2022-04-15 19:06:27,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/config.yaml -2022-04-15 19:06:27,769 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:29,014 INFO Thread-18 :5906 [upload_job.py:push():133] Uploaded file /tmp/tmpbd0vash4wandb/2zz8ar1z-diff.patch -2022-04-15 19:06:29,770 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:33,774 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:47,843 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:49,844 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:51,845 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:53,845 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:55,023 INFO SenderThread:5906 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:06:55,851 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:06:55,851 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json -2022-04-15 19:07:11,877 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:07:19,703 INFO WriterThread:5906 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb -2022-04-15 19:07:19,860 INFO SenderThread:5906 [sender.py:finish():933] shutting down sender -2022-04-15 19:07:19,860 INFO SenderThread:5906 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-15 19:07:19,879 INFO Thread-12 :5906 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt requirements.txt -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-metadata.json wandb-metadata.json -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log output.log -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml conda-environment.yaml -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json wandb-summary.json -2022-04-15 19:07:19,880 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/config.yaml config.yaml -2022-04-15 19:07:19,881 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/diff.patch diff.patch -2022-04-15 19:07:19,881 INFO SenderThread:5906 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/code/train_translation.py code/train_translation.py -2022-04-15 19:07:19,881 INFO SenderThread:5906 [file_pusher.py:finish():176] shutting down file pusher -2022-04-15 19:07:19,881 INFO SenderThread:5906 [file_pusher.py:join():181] waiting for file pusher -2022-04-15 19:07:21,094 INFO Thread-25 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/requirements.txt -2022-04-15 19:07:21,208 INFO Thread-29 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/config.yaml -2022-04-15 19:07:21,219 INFO Thread-26 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/output.log -2022-04-15 19:07:21,814 INFO Thread-27 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/conda-environment.yaml -2022-04-15 19:07:22,524 INFO Thread-28 :5906 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_190620-2py0vpvt/files/wandb-summary.json -2022-04-15 19:07:23,194 ERROR wandb_internal:5906 [internal.py:wandb_internal():159] Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -2022-04-15 19:34:32,989 INFO MainThread:5906 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-15 19:34:32,989 INFO MainThread:5906 [wandb_run.py:_restore():1480] restore -2022-04-15 19:34:33,088 INFO MainThread:5906 [wandb_run.py:_restore():1480] restore -2022-04-15 19:34:33,091 INFO MainThread:5906 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb b/wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb deleted file mode 100644 index 30e91cb..0000000 Binary files a/wandb/run-20220415_190620-2py0vpvt/run-2py0vpvt.wandb and /dev/null differ diff --git a/wandb/run-20220415_193521-231emzap/files/code/train_translation.py b/wandb/run-20220415_193521-231emzap/files/code/train_translation.py deleted file mode 100644 index c6ab0ef..0000000 --- a/wandb/run-20220415_193521-231emzap/files/code/train_translation.py +++ /dev/null @@ -1,400 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=5, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=4, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - - # wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) - tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - print(out) - print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - -# for i in len(tgt_tokens): -# tgt_tokens[i] = id2bert[tgt_tokens[i]] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml b/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220415_193521-231emzap/files/config.yaml b/wandb/run-20220415_193521-231emzap/files/config.yaml deleted file mode 100644 index 4ed8c75..0000000 --- a/wandb/run-20220415_193521-231emzap/files/config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 4 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 5 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 2 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 2 diff --git a/wandb/run-20220415_193521-231emzap/files/diff.patch b/wandb/run-20220415_193521-231emzap/files/diff.patch deleted file mode 100644 index b1ff87d..0000000 --- a/wandb/run-20220415_193521-231emzap/files/diff.patch +++ /dev/null @@ -1,30645 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..babc6a1 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,61 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -+{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -+{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -+{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -+{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -+{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -+{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -+{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -+{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..c6ab0ef 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,13 +327,15 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -+ tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ print(out) -+ print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) -@@ -375,7 +379,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +387,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+# for i in len(tgt_tokens): -+# tgt_tokens[i] = id2bert[tgt_tokens[i]] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..18bad28 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220415_193521-231emzap/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..cb81c04 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220415_193521-231emzap/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..c168413 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220415_193521-231emzap -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220415_193521-231emzap/files/output.log b/wandb/run-20220415_193521-231emzap/files/output.log deleted file mode 100644 index 301455d..0000000 --- a/wandb/run-20220415_193521-231emzap/files/output.log +++ /dev/null @@ -1,77 +0,0 @@ - -train_translation.py -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 28} -/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py:275: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 155} -{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 281} -{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 405} -{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 530} -{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 657} -{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 783} -{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 908} -{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 1033} -translation model saved in checkpoint -Exception in thread Thread-3: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop - msg = self._response_queue.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -wandb: ERROR Internal wandb error: file data was not synced -Exception in thread Thread-15: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status - status_response = self._interface.communicate_stop_status() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status - resp = self._communicate(req, timeout=timeout, local=True) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate - return self._communicate_async(rec, local=local).get(timeout=timeout) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async - raise Exception("The wandb backend process has shutdown") -Exception: The wandb backend process has shutdown -Traceback (most recent call last): - File "", line 1, in - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main - exitcode = _main(fd) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main - return self._bootstrap() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap - threading._shutdown() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown - lock.acquire() -KeyboardInterrupt \ No newline at end of file diff --git a/wandb/run-20220415_193521-231emzap/files/requirements.txt b/wandb/run-20220415_193521-231emzap/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220415_193521-231emzap/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json b/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json deleted file mode 100644 index 02e1ef7..0000000 --- a/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T14:05:22.557883", - "startedAt": "2022-04-15T14:05:21.616163", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220415_193521-231emzap/files/wandb-summary.json b/wandb/run-20220415_193521-231emzap/files/wandb-summary.json deleted file mode 100644 index 3c99905..0000000 --- a/wandb/run-20220415_193521-231emzap/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"epoch_loss": 103.21329364776611, "_runtime": 1149, "_timestamp": 1650032670, "_step": 0} \ No newline at end of file diff --git a/wandb/run-20220415_193521-231emzap/logs/debug-internal.log b/wandb/run-20220415_193521-231emzap/logs/debug-internal.log deleted file mode 100644 index 88e8878..0000000 --- a/wandb/run-20220415_193521-231emzap/logs/debug-internal.log +++ /dev/null @@ -1,302 +0,0 @@ -2022-04-15 19:35:21,654 INFO wandb_internal:6227 [internal.py:wandb_internal():91] W&B internal server running at pid: 6227, started at: 2022-04-15 19:35:21.641638 -2022-04-15 19:35:21,661 INFO MainThread:6227 [wandb_init.py:init():423] backend started and connected -2022-04-15 19:35:21,661 DEBUG MainThread:6227 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-15 19:35:21,663 INFO MainThread:6227 [wandb_init.py:init():465] updated telemetry -2022-04-15 19:35:21,677 INFO MainThread:6227 [wandb_init.py:init():484] communicating current version -2022-04-15 19:35:21,707 DEBUG SenderThread:6227 [sender.py:send():179] send: header -2022-04-15 19:35:21,705 INFO WriterThread:6227 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/run-231emzap.wandb -2022-04-15 19:35:21,707 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: check_version -2022-04-15 19:35:21,707 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: check_version -2022-04-15 19:35:21,919 INFO MainThread:6227 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 19:35:21,919 INFO MainThread:6227 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 19:35:21,929 DEBUG SenderThread:6227 [sender.py:send():179] send: run -2022-04-15 19:35:22,542 INFO MainThread:6227 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 19:35:22,543 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: run_start -2022-04-15 19:35:22,557 DEBUG HandlerThread:6227 [meta.py:__init__():39] meta init -2022-04-15 19:35:22,557 DEBUG HandlerThread:6227 [meta.py:__init__():53] meta init done -2022-04-15 19:35:22,557 DEBUG HandlerThread:6227 [meta.py:probe():210] probe -2022-04-15 19:35:22,564 DEBUG HandlerThread:6227 [meta.py:_setup_git():200] setup git -2022-04-15 19:35:22,618 INFO SenderThread:6227 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files -2022-04-15 19:35:22,618 INFO SenderThread:6227 [sender.py:_start_run_threads():707] run started: 231emzap with start time 1650031521 -2022-04-15 19:35:22,618 DEBUG SenderThread:6227 [sender.py:send():179] send: summary -2022-04-15 19:35:22,619 INFO SenderThread:6227 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:35:22,641 DEBUG HandlerThread:6227 [meta.py:_setup_git():207] setup git done -2022-04-15 19:35:22,641 DEBUG HandlerThread:6227 [meta.py:_save_code():89] save code -2022-04-15 19:35:22,667 DEBUG HandlerThread:6227 [meta.py:_save_code():110] save code done -2022-04-15 19:35:22,668 DEBUG HandlerThread:6227 [meta.py:_save_patches():127] save patches -2022-04-15 19:35:22,900 DEBUG HandlerThread:6227 [meta.py:_save_patches():169] save patches done -2022-04-15 19:35:22,900 DEBUG HandlerThread:6227 [meta.py:_save_pip():57] save pip -2022-04-15 19:35:22,900 DEBUG HandlerThread:6227 [meta.py:_save_pip():71] save pip done -2022-04-15 19:35:22,901 DEBUG HandlerThread:6227 [meta.py:_save_conda():78] save conda -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/code/train_translation.py -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/requirements.txt -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/diff.patch -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json -2022-04-15 19:35:23,605 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/code -2022-04-15 19:35:26,867 DEBUG HandlerThread:6227 [meta.py:_save_conda():86] save conda done -2022-04-15 19:35:26,867 DEBUG HandlerThread:6227 [meta.py:probe():252] probe done -2022-04-15 19:35:26,874 DEBUG SenderThread:6227 [sender.py:send():179] send: files -2022-04-15 19:35:26,874 INFO SenderThread:6227 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 19:35:26,875 INFO SenderThread:6227 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 19:35:26,875 INFO SenderThread:6227 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 19:35:26,897 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:35:26,897 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:35:26,909 INFO MainThread:6227 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 19:35:26,913 INFO MainThread:6227 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 19:35:26,913 INFO MainThread:6227 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 19:35:26,967 INFO MainThread:6227 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 19:35:26,967 INFO MainThread:6227 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 19:35:26,968 INFO MainThread:6227 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-15 19:35:27,399 DEBUG SenderThread:6227 [sender.py:send():179] send: config -2022-04-15 19:35:27,603 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml -2022-04-15 19:35:27,603 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json -2022-04-15 19:35:27,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:35:28,289 INFO Thread-14 :6227 [upload_job.py:push():133] Uploaded file /tmp/tmpit2mxldiwandb/3l2un8y7-wandb-metadata.json -2022-04-15 19:35:28,309 INFO Thread-16 :6227 [upload_job.py:push():133] Uploaded file /tmp/tmpit2mxldiwandb/2642x5u1-code/train_translation.py -2022-04-15 19:35:29,248 INFO Thread-18 :6227 [upload_job.py:push():133] Uploaded file /tmp/tmpit2mxldiwandb/3w1kgl5c-diff.patch -2022-04-15 19:35:29,611 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/config.yaml -2022-04-15 19:35:29,611 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:35:33,612 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:35:42,401 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:35:42,401 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:35:51,746 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:35:57,983 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:35:57,983 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:36:07,625 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:36:09,626 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:36:14,234 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:36:14,234 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:36:22,435 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:36:29,784 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:36:29,784 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:36:45,336 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:36:45,336 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:36:53,115 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:37:00,854 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:37:00,854 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:37:16,412 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:37:16,412 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:37:23,775 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:37:31,915 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:37:31,915 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:37:47,561 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:37:47,562 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:37:54,451 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:38:03,052 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:38:03,052 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:38:15,668 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:38:18,601 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:38:18,601 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:38:25,150 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:38:34,137 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:38:34,138 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:38:49,657 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:38:49,657 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:38:55,860 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:39:05,152 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:39:05,153 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:39:20,952 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:39:20,952 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:39:26,548 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:39:36,524 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:39:36,524 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:39:52,137 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:39:52,138 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:39:57,181 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:40:07,658 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:40:07,658 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:40:21,704 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:40:23,266 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:40:23,267 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:40:27,854 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:40:38,901 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:40:38,901 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:40:54,413 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:40:54,413 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:40:58,482 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:41:09,931 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:41:09,931 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:41:25,494 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:41:25,494 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:41:29,163 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:41:41,013 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:41:41,013 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:41:56,570 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:41:56,570 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:41:59,758 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:42:12,095 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:42:12,095 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:42:25,749 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:42:27,592 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:42:27,592 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:42:30,434 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:42:43,166 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:42:43,166 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:42:58,751 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:42:58,751 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:43:01,144 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:43:14,279 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:43:14,280 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:43:29,854 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:43:29,855 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:43:31,764 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:43:45,403 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:43:45,403 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:44:00,964 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:44:00,965 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:44:02,446 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:44:17,234 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:44:17,234 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:44:29,793 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:44:32,848 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:44:32,848 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:44:33,426 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:44:48,428 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:44:48,428 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:45:03,865 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:45:04,062 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:45:04,062 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:45:19,623 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:45:19,623 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:45:34,533 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:45:35,138 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:45:35,139 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:45:50,645 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:45:50,645 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:46:05,254 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:46:06,221 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:46:06,221 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:46:21,766 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:46:21,766 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:46:35,925 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:46:37,397 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:46:37,397 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:46:37,828 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:46:52,955 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:46:52,955 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:47:06,616 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:47:08,555 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:47:08,555 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:47:24,064 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:47:24,064 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:47:37,263 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:47:39,645 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:47:39,646 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:47:55,185 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:47:55,185 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:48:07,887 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:48:10,707 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:48:10,707 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:48:26,268 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:48:26,269 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:48:38,517 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:48:41,854 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:48:41,854 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:48:43,870 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:48:57,355 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:48:57,355 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:49:09,161 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:49:13,066 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:49:13,066 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:49:28,650 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:49:28,651 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:49:39,887 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:49:44,321 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:49:44,322 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:49:59,888 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:49:59,888 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:50:10,589 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:50:15,420 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:50:15,421 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:50:30,986 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:50:30,987 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:50:41,331 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:50:46,616 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:50:46,617 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:50:47,905 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:51:02,176 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:51:02,176 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:51:12,008 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:51:17,725 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:51:17,725 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:51:33,508 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:51:33,508 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:51:42,613 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:51:49,212 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:51:49,212 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:52:04,733 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:52:04,733 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:52:13,263 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:52:20,327 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:52:20,327 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:52:35,877 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:52:35,877 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:52:43,808 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:52:51,414 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:52:51,414 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:52:54,940 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:53:07,030 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:53:07,030 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:53:14,500 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:53:22,649 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:53:22,650 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:53:38,185 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:53:38,185 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:53:45,170 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:53:53,780 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:53:53,780 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:54:09,368 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:54:09,368 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:54:15,790 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:54:24,949 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:54:24,949 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:54:30,772 DEBUG SenderThread:6227 [sender.py:send():179] send: history -2022-04-15 19:54:30,772 DEBUG SenderThread:6227 [sender.py:send():179] send: summary -2022-04-15 19:54:30,772 INFO SenderThread:6227 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:54:30,966 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json -2022-04-15 19:54:40,527 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:54:40,528 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:54:46,854 DEBUG SenderThread:6227 [sender.py:send():179] send: stats -2022-04-15 19:54:48,284 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:54:56,050 DEBUG HandlerThread:6227 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 19:54:56,050 DEBUG SenderThread:6227 [sender.py:send_request():193] send_request: stop_status -2022-04-15 19:54:58,727 INFO SenderThread:6227 [sender.py:finish():933] shutting down sender -2022-04-15 19:54:58,727 INFO SenderThread:6227 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-15 19:54:59,092 INFO WriterThread:6227 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/run-231emzap.wandb -2022-04-15 19:54:59,343 INFO SenderThread:6227 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files -2022-04-15 19:54:59,343 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/requirements.txt requirements.txt -2022-04-15 19:54:59,343 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json wandb-metadata.json -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log output.log -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml conda-environment.yaml -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json wandb-summary.json -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/config.yaml config.yaml -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/diff.patch diff.patch -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/code/train_translation.py code/train_translation.py -2022-04-15 19:54:59,345 INFO SenderThread:6227 [file_pusher.py:finish():176] shutting down file pusher -2022-04-15 19:54:59,345 INFO SenderThread:6227 [file_pusher.py:join():181] waiting for file pusher -2022-04-15 19:55:00,848 INFO Thread-29 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/config.yaml -2022-04-15 19:55:00,870 INFO Thread-25 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/requirements.txt -2022-04-15 19:55:00,895 INFO Thread-28 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json -2022-04-15 19:55:00,913 INFO Thread-27 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml -2022-04-15 19:55:00,979 INFO Thread-26 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:55:02,041 ERROR wandb_internal:6227 [internal.py:wandb_internal():159] Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -2022-04-15 20:07:26,362 INFO MainThread:6227 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-15 20:07:26,362 INFO MainThread:6227 [wandb_run.py:_restore():1480] restore -2022-04-15 20:07:26,757 INFO MainThread:6227 [wandb_run.py:_restore():1480] restore -2022-04-15 20:07:26,761 INFO MainThread:6227 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220415_193521-231emzap/logs/debug.log b/wandb/run-20220415_193521-231emzap/logs/debug.log deleted file mode 100644 index 18e01c4..0000000 --- a/wandb/run-20220415_193521-231emzap/logs/debug.log +++ /dev/null @@ -1,97 +0,0 @@ -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/logs/debug.log -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/logs/debug-internal.log -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_init.py:init():369] calling init triggers -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-15 19:35:21,618 INFO MainThread:6227 [wandb_init.py:init():418] starting backend -2022-04-15 19:35:21,630 INFO MainThread:6227 [backend.py:ensure_launched():132] starting backend process... -2022-04-15 19:35:21,641 INFO MainThread:6227 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-15 19:35:21,654 INFO wandb_internal:6227 [internal.py:wandb_internal():91] W&B internal server running at pid: 6227, started at: 2022-04-15 19:35:21.641638 -2022-04-15 19:35:21,661 INFO MainThread:6227 [wandb_init.py:init():423] backend started and connected -2022-04-15 19:35:21,663 INFO MainThread:6227 [wandb_init.py:init():465] updated telemetry -2022-04-15 19:35:21,677 INFO MainThread:6227 [wandb_init.py:init():484] communicating current version -2022-04-15 19:35:21,705 INFO WriterThread:6227 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/run-231emzap.wandb -2022-04-15 19:35:21,919 INFO MainThread:6227 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 19:35:21,919 INFO MainThread:6227 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 19:35:22,542 INFO MainThread:6227 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 19:35:22,618 INFO SenderThread:6227 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files -2022-04-15 19:35:22,618 INFO SenderThread:6227 [sender.py:_start_run_threads():707] run started: 231emzap with start time 1650031521 -2022-04-15 19:35:22,619 INFO SenderThread:6227 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/code/train_translation.py -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/requirements.txt -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/diff.patch -2022-04-15 19:35:23,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json -2022-04-15 19:35:23,605 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/code -2022-04-15 19:35:26,874 INFO SenderThread:6227 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 19:35:26,875 INFO SenderThread:6227 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 19:35:26,875 INFO SenderThread:6227 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 19:35:26,909 INFO MainThread:6227 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 19:35:26,913 INFO MainThread:6227 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 19:35:26,913 INFO MainThread:6227 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 19:35:26,967 INFO MainThread:6227 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 19:35:26,967 INFO MainThread:6227 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 19:35:26,968 INFO MainThread:6227 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-15 19:35:27,603 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml -2022-04-15 19:35:27,603 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json -2022-04-15 19:35:27,604 INFO Thread-12 :6227 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:35:28,289 INFO Thread-14 :6227 [upload_job.py:push():133] Uploaded file /tmp/tmpit2mxldiwandb/3l2un8y7-wandb-metadata.json -2022-04-15 19:35:28,309 INFO Thread-16 :6227 [upload_job.py:push():133] Uploaded file /tmp/tmpit2mxldiwandb/2642x5u1-code/train_translation.py -2022-04-15 19:35:29,248 INFO Thread-18 :6227 [upload_job.py:push():133] Uploaded file /tmp/tmpit2mxldiwandb/3w1kgl5c-diff.patch -2022-04-15 19:35:29,611 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/config.yaml -2022-04-15 19:35:29,611 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:35:33,612 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:36:07,625 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:36:09,626 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:38:15,668 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:40:21,704 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:42:25,749 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:44:29,793 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:46:37,828 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:48:43,870 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:50:47,905 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:52:54,940 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:54:30,772 INFO SenderThread:6227 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 19:54:30,966 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json -2022-04-15 19:54:48,284 INFO Thread-12 :6227 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:54:58,727 INFO SenderThread:6227 [sender.py:finish():933] shutting down sender -2022-04-15 19:54:58,727 INFO SenderThread:6227 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-15 19:54:59,092 INFO WriterThread:6227 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/run-231emzap.wandb -2022-04-15 19:54:59,343 INFO SenderThread:6227 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files -2022-04-15 19:54:59,343 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/requirements.txt requirements.txt -2022-04-15 19:54:59,343 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-metadata.json wandb-metadata.json -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log output.log -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml conda-environment.yaml -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json wandb-summary.json -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/config.yaml config.yaml -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/diff.patch diff.patch -2022-04-15 19:54:59,344 INFO SenderThread:6227 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/code/train_translation.py code/train_translation.py -2022-04-15 19:54:59,345 INFO SenderThread:6227 [file_pusher.py:finish():176] shutting down file pusher -2022-04-15 19:54:59,345 INFO SenderThread:6227 [file_pusher.py:join():181] waiting for file pusher -2022-04-15 19:55:00,848 INFO Thread-29 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/config.yaml -2022-04-15 19:55:00,870 INFO Thread-25 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/requirements.txt -2022-04-15 19:55:00,895 INFO Thread-28 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/wandb-summary.json -2022-04-15 19:55:00,913 INFO Thread-27 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/conda-environment.yaml -2022-04-15 19:55:00,979 INFO Thread-26 :6227 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_193521-231emzap/files/output.log -2022-04-15 19:55:02,041 ERROR wandb_internal:6227 [internal.py:wandb_internal():159] Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -2022-04-15 20:07:26,362 INFO MainThread:6227 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-15 20:07:26,362 INFO MainThread:6227 [wandb_run.py:_restore():1480] restore -2022-04-15 20:07:26,757 INFO MainThread:6227 [wandb_run.py:_restore():1480] restore -2022-04-15 20:07:26,761 INFO MainThread:6227 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220415_193521-231emzap/run-231emzap.wandb b/wandb/run-20220415_193521-231emzap/run-231emzap.wandb deleted file mode 100644 index ceb5081..0000000 Binary files a/wandb/run-20220415_193521-231emzap/run-231emzap.wandb and /dev/null differ diff --git a/wandb/run-20220415_203240-1bwp8j0o/files/code/train_translation.py b/wandb/run-20220415_203240-1bwp8j0o/files/code/train_translation.py deleted file mode 100644 index a7a253c..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/files/code/train_translation.py +++ /dev/null @@ -1,401 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=5, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=4, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - - # wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - print(out) - print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - -# for i in len(tgt_tokens): -# tgt_tokens[i] = id2bert[tgt_tokens[i]] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220415_203240-1bwp8j0o/files/conda-environment.yaml b/wandb/run-20220415_203240-1bwp8j0o/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220415_203240-1bwp8j0o/files/config.yaml b/wandb/run-20220415_203240-1bwp8j0o/files/config.yaml deleted file mode 100644 index 4ed8c75..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/files/config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 4 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 5 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 2 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 2 diff --git a/wandb/run-20220415_203240-1bwp8j0o/files/diff.patch b/wandb/run-20220415_203240-1bwp8j0o/files/diff.patch deleted file mode 100644 index 8d75a67..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/files/diff.patch +++ /dev/null @@ -1,30655 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..296d49a 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,71 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -+{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -+{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -+{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -+{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -+{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -+{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -+{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -+{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -+train_translation.py -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 28} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 155} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 281} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 405} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 530} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 657} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 783} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 908} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 1033} -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..a7a253c 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,13 +327,16 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -+# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ print(out) -+ print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) -@@ -375,7 +380,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +388,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+# for i in len(tgt_tokens): -+# tgt_tokens[i] = id2bert[tgt_tokens[i]] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..b09d5c6 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220415_203240-1bwp8j0o/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..65fde58 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220415_203240-1bwp8j0o/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..a7a7812 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220415_203240-1bwp8j0o -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220415_203240-1bwp8j0o/files/output.log b/wandb/run-20220415_203240-1bwp8j0o/files/output.log deleted file mode 100644 index e69de29..0000000 diff --git a/wandb/run-20220415_203240-1bwp8j0o/files/requirements.txt b/wandb/run-20220415_203240-1bwp8j0o/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220415_203240-1bwp8j0o/files/wandb-metadata.json b/wandb/run-20220415_203240-1bwp8j0o/files/wandb-metadata.json deleted file mode 100644 index 635bb75..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/files/wandb-metadata.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T15:02:42.085900", - "startedAt": "2022-04-15T15:02:40.953964", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220415_203240-1bwp8j0o/files/wandb-summary.json b/wandb/run-20220415_203240-1bwp8j0o/files/wandb-summary.json deleted file mode 100644 index 9e26dfe..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/wandb/run-20220415_203240-1bwp8j0o/logs/debug-internal.log b/wandb/run-20220415_203240-1bwp8j0o/logs/debug-internal.log deleted file mode 100644 index 6491045..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/logs/debug-internal.log +++ /dev/null @@ -1,56 +0,0 @@ -2022-04-15 20:32:40,986 INFO wandb_internal:6751 [internal.py:wandb_internal():91] W&B internal server running at pid: 6751, started at: 2022-04-15 20:32:40.973711 -2022-04-15 20:32:40,989 INFO MainThread:6751 [wandb_init.py:init():423] backend started and connected -2022-04-15 20:32:40,989 DEBUG MainThread:6751 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-15 20:32:40,991 INFO MainThread:6751 [wandb_init.py:init():465] updated telemetry -2022-04-15 20:32:41,002 INFO MainThread:6751 [wandb_init.py:init():484] communicating current version -2022-04-15 20:32:41,033 DEBUG SenderThread:6751 [sender.py:send():179] send: header -2022-04-15 20:32:41,033 INFO WriterThread:6751 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/run-1bwp8j0o.wandb -2022-04-15 20:32:41,034 DEBUG HandlerThread:6751 [handler.py:handle_request():124] handle_request: check_version -2022-04-15 20:32:41,034 DEBUG SenderThread:6751 [sender.py:send_request():193] send_request: check_version -2022-04-15 20:32:41,353 INFO MainThread:6751 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 20:32:41,353 INFO MainThread:6751 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 20:32:41,355 DEBUG SenderThread:6751 [sender.py:send():179] send: run -2022-04-15 20:32:42,068 INFO MainThread:6751 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 20:32:42,068 DEBUG HandlerThread:6751 [handler.py:handle_request():124] handle_request: run_start -2022-04-15 20:32:42,085 DEBUG HandlerThread:6751 [meta.py:__init__():39] meta init -2022-04-15 20:32:42,085 DEBUG HandlerThread:6751 [meta.py:__init__():53] meta init done -2022-04-15 20:32:42,085 DEBUG HandlerThread:6751 [meta.py:probe():210] probe -2022-04-15 20:32:42,092 DEBUG HandlerThread:6751 [meta.py:_setup_git():200] setup git -2022-04-15 20:32:42,154 INFO SenderThread:6751 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files -2022-04-15 20:32:42,154 INFO SenderThread:6751 [sender.py:_start_run_threads():707] run started: 1bwp8j0o with start time 1650034960 -2022-04-15 20:32:42,154 DEBUG SenderThread:6751 [sender.py:send():179] send: summary -2022-04-15 20:32:42,155 INFO SenderThread:6751 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 20:32:42,181 DEBUG HandlerThread:6751 [meta.py:_setup_git():207] setup git done -2022-04-15 20:32:42,181 DEBUG HandlerThread:6751 [meta.py:_save_code():89] save code -2022-04-15 20:32:42,212 DEBUG HandlerThread:6751 [meta.py:_save_code():110] save code done -2022-04-15 20:32:42,212 DEBUG HandlerThread:6751 [meta.py:_save_patches():127] save patches -2022-04-15 20:32:42,390 DEBUG HandlerThread:6751 [meta.py:_save_patches():169] save patches done -2022-04-15 20:32:42,390 DEBUG HandlerThread:6751 [meta.py:_save_pip():57] save pip -2022-04-15 20:32:42,391 DEBUG HandlerThread:6751 [meta.py:_save_pip():71] save pip done -2022-04-15 20:32:42,391 DEBUG HandlerThread:6751 [meta.py:_save_conda():78] save conda -2022-04-15 20:32:43,129 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/requirements.txt -2022-04-15 20:32:43,129 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/wandb-summary.json -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/code/train_translation.py -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/conda-environment.yaml -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/diff.patch -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/code -2022-04-15 20:32:47,042 DEBUG HandlerThread:6751 [meta.py:_save_conda():86] save conda done -2022-04-15 20:32:47,042 DEBUG HandlerThread:6751 [meta.py:probe():252] probe done -2022-04-15 20:32:47,048 DEBUG SenderThread:6751 [sender.py:send():179] send: files -2022-04-15 20:32:47,048 INFO SenderThread:6751 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 20:32:47,048 INFO SenderThread:6751 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 20:32:47,048 INFO SenderThread:6751 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 20:32:47,070 DEBUG HandlerThread:6751 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 20:32:47,070 DEBUG SenderThread:6751 [sender.py:send_request():193] send_request: stop_status -2022-04-15 20:32:47,081 INFO MainThread:6751 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 20:32:47,082 INFO MainThread:6751 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 20:32:47,082 INFO MainThread:6751 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 20:32:47,128 INFO Thread-12 :6751 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/conda-environment.yaml -2022-04-15 20:32:47,128 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/output.log -2022-04-15 20:32:47,128 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/wandb-metadata.json -2022-04-15 20:32:47,137 INFO MainThread:6751 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 20:32:47,137 INFO MainThread:6751 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 20:32:47,138 INFO MainThread:6751 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-15 20:32:47,644 DEBUG SenderThread:6751 [sender.py:send():179] send: config -2022-04-15 20:32:48,685 INFO Thread-14 :6751 [upload_job.py:push():133] Uploaded file /tmp/tmpfzph_9yfwandb/a1uf7dt2-wandb-metadata.json diff --git a/wandb/run-20220415_203240-1bwp8j0o/logs/debug.log b/wandb/run-20220415_203240-1bwp8j0o/logs/debug.log deleted file mode 100644 index e0e86ab..0000000 --- a/wandb/run-20220415_203240-1bwp8j0o/logs/debug.log +++ /dev/null @@ -1,41 +0,0 @@ -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/logs/debug.log -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/logs/debug-internal.log -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_init.py:init():369] calling init triggers -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-15 20:32:40,955 INFO MainThread:6751 [wandb_init.py:init():418] starting backend -2022-04-15 20:32:40,963 INFO MainThread:6751 [backend.py:ensure_launched():132] starting backend process... -2022-04-15 20:32:40,973 INFO MainThread:6751 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-15 20:32:40,986 INFO wandb_internal:6751 [internal.py:wandb_internal():91] W&B internal server running at pid: 6751, started at: 2022-04-15 20:32:40.973711 -2022-04-15 20:32:40,989 INFO MainThread:6751 [wandb_init.py:init():423] backend started and connected -2022-04-15 20:32:40,991 INFO MainThread:6751 [wandb_init.py:init():465] updated telemetry -2022-04-15 20:32:41,002 INFO MainThread:6751 [wandb_init.py:init():484] communicating current version -2022-04-15 20:32:41,033 INFO WriterThread:6751 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/run-1bwp8j0o.wandb -2022-04-15 20:32:41,353 INFO MainThread:6751 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 20:32:41,353 INFO MainThread:6751 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 20:32:42,068 INFO MainThread:6751 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 20:32:42,154 INFO SenderThread:6751 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files -2022-04-15 20:32:42,154 INFO SenderThread:6751 [sender.py:_start_run_threads():707] run started: 1bwp8j0o with start time 1650034960 -2022-04-15 20:32:42,155 INFO SenderThread:6751 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 20:32:43,129 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/requirements.txt -2022-04-15 20:32:43,129 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/wandb-summary.json -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/code/train_translation.py -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/conda-environment.yaml -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/diff.patch -2022-04-15 20:32:43,130 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/code -2022-04-15 20:32:47,048 INFO SenderThread:6751 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 20:32:47,048 INFO SenderThread:6751 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 20:32:47,048 INFO SenderThread:6751 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 20:32:47,081 INFO MainThread:6751 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 20:32:47,082 INFO MainThread:6751 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 20:32:47,082 INFO MainThread:6751 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 20:32:47,128 INFO Thread-12 :6751 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/conda-environment.yaml -2022-04-15 20:32:47,128 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/output.log -2022-04-15 20:32:47,128 INFO Thread-12 :6751 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203240-1bwp8j0o/files/wandb-metadata.json -2022-04-15 20:32:47,137 INFO MainThread:6751 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 20:32:47,137 INFO MainThread:6751 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 20:32:47,138 INFO MainThread:6751 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-15 20:32:48,685 INFO Thread-14 :6751 [upload_job.py:push():133] Uploaded file /tmp/tmpfzph_9yfwandb/a1uf7dt2-wandb-metadata.json diff --git a/wandb/run-20220415_203240-1bwp8j0o/run-1bwp8j0o.wandb b/wandb/run-20220415_203240-1bwp8j0o/run-1bwp8j0o.wandb deleted file mode 100644 index e69de29..0000000 diff --git a/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py b/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py deleted file mode 100644 index a7a253c..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py +++ /dev/null @@ -1,401 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=5, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=4, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - - # wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - print(out) - print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - -# for i in len(tgt_tokens): -# tgt_tokens[i] = id2bert[tgt_tokens[i]] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml b/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220415_203417-2injabwk/files/config.yaml b/wandb/run-20220415_203417-2injabwk/files/config.yaml deleted file mode 100644 index b88038a..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 4 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 5 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 1 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 1 diff --git a/wandb/run-20220415_203417-2injabwk/files/diff.patch b/wandb/run-20220415_203417-2injabwk/files/diff.patch deleted file mode 100644 index aba1e36..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/diff.patch +++ /dev/null @@ -1,30656 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..78b8901 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,72 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -+{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -+{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -+{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -+{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -+{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -+{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -+{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -+{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -+train_translation.py -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 28} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 155} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 281} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 405} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 530} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 657} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 783} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 908} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 1033} -+train_translation.py -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..a7a253c 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,13 +327,16 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -+# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert, gpu) - predicted.append(out) - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ print(out) -+ print(tokenizer.convert_ids_to_tokens(tgt_out)) - - try: - bleu_score(predicted, target) -@@ -375,7 +380,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +388,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+# for i in len(tgt_tokens): -+# tgt_tokens[i] = id2bert[tgt_tokens[i]] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..addd4fa 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220415_203417-2injabwk/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..b839e8d 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220415_203417-2injabwk/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..86c21fa 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220415_203417-2injabwk -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220415_203417-2injabwk/files/output.log b/wandb/run-20220415_203417-2injabwk/files/output.log deleted file mode 100644 index 451faa2..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/output.log +++ /dev/null @@ -1,65 +0,0 @@ - -train_translation.py -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -Exception in thread Thread-3: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop - msg = self._response_queue.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -wandb: ERROR Internal wandb error: file data was not synced -Exception in thread Thread-15: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status - status_response = self._interface.communicate_stop_status() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status - resp = self._communicate(req, timeout=timeout, local=True) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate - return self._communicate_async(rec, local=local).get(timeout=timeout) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async - raise Exception("The wandb backend process has shutdown") -Exception: The wandb backend process has shutdown -Traceback (most recent call last): - File "", line 1, in - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main - exitcode = _main(fd) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main - return self._bootstrap() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap - threading._shutdown() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown - lock.acquire() -KeyboardInterrupt \ No newline at end of file diff --git a/wandb/run-20220415_203417-2injabwk/files/requirements.txt b/wandb/run-20220415_203417-2injabwk/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json b/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json deleted file mode 100644 index 35794ce..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T15:04:19.477918", - "startedAt": "2022-04-15T15:04:17.866522", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json b/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json deleted file mode 100644 index 9e26dfe..0000000 --- a/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/wandb/run-20220415_203417-2injabwk/logs/debug-internal.log b/wandb/run-20220415_203417-2injabwk/logs/debug-internal.log deleted file mode 100644 index 4eaab20..0000000 --- a/wandb/run-20220415_203417-2injabwk/logs/debug-internal.log +++ /dev/null @@ -1,100 +0,0 @@ -2022-04-15 20:34:17,894 INFO wandb_internal:6840 [internal.py:wandb_internal():91] W&B internal server running at pid: 6840, started at: 2022-04-15 20:34:17.893635 -2022-04-15 20:34:17,917 INFO MainThread:6840 [wandb_init.py:init():423] backend started and connected -2022-04-15 20:34:17,917 DEBUG MainThread:6840 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-15 20:34:17,919 INFO MainThread:6840 [wandb_init.py:init():465] updated telemetry -2022-04-15 20:34:17,937 INFO MainThread:6840 [wandb_init.py:init():484] communicating current version -2022-04-15 20:34:17,938 DEBUG SenderThread:6840 [sender.py:send():179] send: header -2022-04-15 20:34:17,938 INFO WriterThread:6840 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb -2022-04-15 20:34:17,940 DEBUG HandlerThread:6840 [handler.py:handle_request():124] handle_request: check_version -2022-04-15 20:34:17,940 DEBUG SenderThread:6840 [sender.py:send_request():193] send_request: check_version -2022-04-15 20:34:18,241 INFO MainThread:6840 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 20:34:18,242 INFO MainThread:6840 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 20:34:18,244 DEBUG SenderThread:6840 [sender.py:send():179] send: run -2022-04-15 20:34:19,461 INFO MainThread:6840 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 20:34:19,462 DEBUG HandlerThread:6840 [handler.py:handle_request():124] handle_request: run_start -2022-04-15 20:34:19,477 DEBUG HandlerThread:6840 [meta.py:__init__():39] meta init -2022-04-15 20:34:19,477 DEBUG HandlerThread:6840 [meta.py:__init__():53] meta init done -2022-04-15 20:34:19,477 DEBUG HandlerThread:6840 [meta.py:probe():210] probe -2022-04-15 20:34:19,503 DEBUG HandlerThread:6840 [meta.py:_setup_git():200] setup git -2022-04-15 20:34:19,521 INFO SenderThread:6840 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files -2022-04-15 20:34:19,522 INFO SenderThread:6840 [sender.py:_start_run_threads():707] run started: 2injabwk with start time 1650035057 -2022-04-15 20:34:19,522 DEBUG SenderThread:6840 [sender.py:send():179] send: summary -2022-04-15 20:34:19,523 INFO SenderThread:6840 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 20:34:19,589 DEBUG HandlerThread:6840 [meta.py:_setup_git():207] setup git done -2022-04-15 20:34:19,590 DEBUG HandlerThread:6840 [meta.py:_save_code():89] save code -2022-04-15 20:34:19,635 DEBUG HandlerThread:6840 [meta.py:_save_code():110] save code done -2022-04-15 20:34:19,635 DEBUG HandlerThread:6840 [meta.py:_save_patches():127] save patches -2022-04-15 20:34:19,863 DEBUG HandlerThread:6840 [meta.py:_save_patches():169] save patches done -2022-04-15 20:34:19,863 DEBUG HandlerThread:6840 [meta.py:_save_pip():57] save pip -2022-04-15 20:34:19,864 DEBUG HandlerThread:6840 [meta.py:_save_pip():71] save pip done -2022-04-15 20:34:19,864 DEBUG HandlerThread:6840 [meta.py:_save_conda():78] save conda -2022-04-15 20:34:20,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json -2022-04-15 20:34:20,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml -2022-04-15 20:34:20,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py -2022-04-15 20:34:20,515 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/diff.patch -2022-04-15 20:34:20,515 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/requirements.txt -2022-04-15 20:34:20,515 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/code -2022-04-15 20:34:24,109 DEBUG HandlerThread:6840 [meta.py:_save_conda():86] save conda done -2022-04-15 20:34:24,109 DEBUG HandlerThread:6840 [meta.py:probe():252] probe done -2022-04-15 20:34:24,112 DEBUG SenderThread:6840 [sender.py:send():179] send: files -2022-04-15 20:34:24,112 INFO SenderThread:6840 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 20:34:24,112 INFO SenderThread:6840 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 20:34:24,113 INFO SenderThread:6840 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 20:34:24,142 DEBUG HandlerThread:6840 [handler.py:handle_request():124] handle_request: stop_status -2022-04-15 20:34:24,142 DEBUG SenderThread:6840 [sender.py:send_request():193] send_request: stop_status -2022-04-15 20:34:24,154 INFO MainThread:6840 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 20:34:24,154 INFO MainThread:6840 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 20:34:24,155 INFO MainThread:6840 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 20:34:24,227 INFO MainThread:6840 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 20:34:24,227 INFO MainThread:6840 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 20:34:24,227 INFO MainThread:6840 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} -2022-04-15 20:34:24,513 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml -2022-04-15 20:34:24,513 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json -2022-04-15 20:34:24,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:24,850 DEBUG SenderThread:6840 [sender.py:send():179] send: config -2022-04-15 20:34:25,811 INFO Thread-14 :6840 [upload_job.py:push():133] Uploaded file /tmp/tmpdyry0x9pwandb/w18tghfd-wandb-metadata.json -2022-04-15 20:34:25,876 INFO Thread-17 :6840 [upload_job.py:push():133] Uploaded file /tmp/tmpdyry0x9pwandb/22s9hrau-code/train_translation.py -2022-04-15 20:34:26,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/config.yaml -2022-04-15 20:34:26,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:26,625 INFO Thread-19 :6840 [upload_job.py:push():133] Uploaded file /tmp/tmpdyry0x9pwandb/1n34jtgp-diff.patch -2022-04-15 20:34:28,518 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:32,520 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:33,257 INFO SenderThread:6840 [sender.py:finish():933] shutting down sender -2022-04-15 20:34:33,257 INFO SenderThread:6840 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-15 20:34:33,257 INFO WriterThread:6840 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb -2022-04-15 20:34:33,520 INFO SenderThread:6840 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/requirements.txt requirements.txt -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json wandb-metadata.json -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log output.log -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml conda-environment.yaml -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json wandb-summary.json -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/config.yaml config.yaml -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/diff.patch diff.patch -2022-04-15 20:34:33,522 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py code/train_translation.py -2022-04-15 20:34:33,522 INFO SenderThread:6840 [file_pusher.py:finish():176] shutting down file pusher -2022-04-15 20:34:33,522 INFO SenderThread:6840 [file_pusher.py:join():181] waiting for file pusher -2022-04-15 20:34:35,046 INFO Thread-24 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:35,048 INFO Thread-27 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/config.yaml -2022-04-15 20:34:35,101 INFO Thread-25 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml -2022-04-15 20:34:35,453 INFO Thread-26 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json -2022-04-15 20:34:35,455 INFO Thread-23 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/requirements.txt -2022-04-15 20:34:36,378 ERROR wandb_internal:6840 [internal.py:wandb_internal():159] Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -2022-04-15 20:46:13,288 INFO MainThread:6840 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-15 20:46:13,288 INFO MainThread:6840 [wandb_run.py:_restore():1480] restore -2022-04-15 20:46:14,033 INFO MainThread:6840 [wandb_run.py:_restore():1480] restore -2022-04-15 20:46:14,036 INFO MainThread:6840 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220415_203417-2injabwk/logs/debug.log b/wandb/run-20220415_203417-2injabwk/logs/debug.log deleted file mode 100644 index d999a97..0000000 --- a/wandb/run-20220415_203417-2injabwk/logs/debug.log +++ /dev/null @@ -1,85 +0,0 @@ -2022-04-15 20:34:17,868 INFO MainThread:6840 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-15 20:34:17,868 INFO MainThread:6840 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-15 20:34:17,868 INFO MainThread:6840 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/logs/debug.log -2022-04-15 20:34:17,868 INFO MainThread:6840 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/logs/debug-internal.log -2022-04-15 20:34:17,868 INFO MainThread:6840 [wandb_init.py:init():369] calling init triggers -2022-04-15 20:34:17,869 INFO MainThread:6840 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} -2022-04-15 20:34:17,869 INFO MainThread:6840 [wandb_init.py:init():418] starting backend -2022-04-15 20:34:17,879 INFO MainThread:6840 [backend.py:ensure_launched():132] starting backend process... -2022-04-15 20:34:17,893 INFO MainThread:6840 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-15 20:34:17,894 INFO wandb_internal:6840 [internal.py:wandb_internal():91] W&B internal server running at pid: 6840, started at: 2022-04-15 20:34:17.893635 -2022-04-15 20:34:17,917 INFO MainThread:6840 [wandb_init.py:init():423] backend started and connected -2022-04-15 20:34:17,919 INFO MainThread:6840 [wandb_init.py:init():465] updated telemetry -2022-04-15 20:34:17,937 INFO MainThread:6840 [wandb_init.py:init():484] communicating current version -2022-04-15 20:34:17,938 INFO WriterThread:6840 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb -2022-04-15 20:34:18,241 INFO MainThread:6840 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-15 20:34:18,242 INFO MainThread:6840 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-15 20:34:19,461 INFO MainThread:6840 [wandb_init.py:init():522] starting run threads in backend -2022-04-15 20:34:19,521 INFO SenderThread:6840 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files -2022-04-15 20:34:19,522 INFO SenderThread:6840 [sender.py:_start_run_threads():707] run started: 2injabwk with start time 1650035057 -2022-04-15 20:34:19,523 INFO SenderThread:6840 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-15 20:34:20,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json -2022-04-15 20:34:20,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml -2022-04-15 20:34:20,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py -2022-04-15 20:34:20,515 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/diff.patch -2022-04-15 20:34:20,515 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/requirements.txt -2022-04-15 20:34:20,515 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/code -2022-04-15 20:34:24,112 INFO SenderThread:6840 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-15 20:34:24,112 INFO SenderThread:6840 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-15 20:34:24,113 INFO SenderThread:6840 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-15 20:34:24,154 INFO MainThread:6840 [wandb_run.py:_console_start():1538] atexit reg -2022-04-15 20:34:24,154 INFO MainThread:6840 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-15 20:34:24,155 INFO MainThread:6840 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-15 20:34:24,227 INFO MainThread:6840 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-15 20:34:24,227 INFO MainThread:6840 [wandb_init.py:init():547] run started, returning control to user process -2022-04-15 20:34:24,227 INFO MainThread:6840 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} -2022-04-15 20:34:24,513 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml -2022-04-15 20:34:24,513 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json -2022-04-15 20:34:24,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:25,811 INFO Thread-14 :6840 [upload_job.py:push():133] Uploaded file /tmp/tmpdyry0x9pwandb/w18tghfd-wandb-metadata.json -2022-04-15 20:34:25,876 INFO Thread-17 :6840 [upload_job.py:push():133] Uploaded file /tmp/tmpdyry0x9pwandb/22s9hrau-code/train_translation.py -2022-04-15 20:34:26,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/config.yaml -2022-04-15 20:34:26,514 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:26,625 INFO Thread-19 :6840 [upload_job.py:push():133] Uploaded file /tmp/tmpdyry0x9pwandb/1n34jtgp-diff.patch -2022-04-15 20:34:28,518 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:32,520 INFO Thread-12 :6840 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:33,257 INFO SenderThread:6840 [sender.py:finish():933] shutting down sender -2022-04-15 20:34:33,257 INFO SenderThread:6840 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-15 20:34:33,257 INFO WriterThread:6840 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb -2022-04-15 20:34:33,520 INFO SenderThread:6840 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/requirements.txt requirements.txt -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-metadata.json wandb-metadata.json -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log output.log -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml conda-environment.yaml -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json wandb-summary.json -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/config.yaml config.yaml -2022-04-15 20:34:33,521 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/diff.patch diff.patch -2022-04-15 20:34:33,522 INFO SenderThread:6840 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/code/train_translation.py code/train_translation.py -2022-04-15 20:34:33,522 INFO SenderThread:6840 [file_pusher.py:finish():176] shutting down file pusher -2022-04-15 20:34:33,522 INFO SenderThread:6840 [file_pusher.py:join():181] waiting for file pusher -2022-04-15 20:34:35,046 INFO Thread-24 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/output.log -2022-04-15 20:34:35,048 INFO Thread-27 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/config.yaml -2022-04-15 20:34:35,101 INFO Thread-25 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/conda-environment.yaml -2022-04-15 20:34:35,453 INFO Thread-26 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/wandb-summary.json -2022-04-15 20:34:35,455 INFO Thread-23 :6840 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220415_203417-2injabwk/files/requirements.txt -2022-04-15 20:34:36,378 ERROR wandb_internal:6840 [internal.py:wandb_internal():159] Thread HandlerThread: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run - self._run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run - record = self._input_record_q.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError -EOFError -2022-04-15 20:46:13,288 INFO MainThread:6840 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-15 20:46:13,288 INFO MainThread:6840 [wandb_run.py:_restore():1480] restore -2022-04-15 20:46:14,033 INFO MainThread:6840 [wandb_run.py:_restore():1480] restore -2022-04-15 20:46:14,036 INFO MainThread:6840 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb b/wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb deleted file mode 100644 index 3ae463a..0000000 Binary files a/wandb/run-20220415_203417-2injabwk/run-2injabwk.wandb and /dev/null differ diff --git a/wandb/run-20220416_013009-2m8v6ch7/files/code/train_translation.py b/wandb/run-20220416_013009-2m8v6ch7/files/code/train_translation.py deleted file mode 100644 index 49b1b0a..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/files/code/train_translation.py +++ /dev/null @@ -1,402 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=5, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - -# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) - for i in range(len(tgt_out)): - tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - - for i in range(len(tgt_tokens)): - tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml b/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220416_013009-2m8v6ch7/files/config.yaml b/wandb/run-20220416_013009-2m8v6ch7/files/config.yaml deleted file mode 100644 index 4458c44..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/files/config.yaml +++ /dev/null @@ -1,115 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 2: - - 1 - - 11 - 3: - - 2 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 16 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 5 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 2 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 2 diff --git a/wandb/run-20220416_013009-2m8v6ch7/files/diff.patch b/wandb/run-20220416_013009-2m8v6ch7/files/diff.patch deleted file mode 100644 index 33f52c4..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/files/diff.patch +++ /dev/null @@ -1,30763 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..2b00de1 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,160 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -+{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -+{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -+{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -+{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -+{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -+{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -+{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -+{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -+train_translation.py -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 28} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 155} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 281} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 405} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 530} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 657} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 783} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 908} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 1033} -+train_translation.py -+train_translation.py -+train_translation.py -+train_translation.py --load=1 -+train_translation.py --load=1 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 66} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 179} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 16} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 184} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 240} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 296} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 352} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 408} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 464} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 692} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 106} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 441} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 19} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 104} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 188} -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..49b1b0a 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -49,7 +50,7 @@ parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') - parser.add_argument('--epochs', default=5, type=int, metavar='N', - help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -+parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') - parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -267,7 +269,7 @@ def main_worker(gpu, args): - optimizer.step() - # losses += loss.item() - -- # wandb.log({'iter_loss': loss}) -+# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,13 +327,17 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -+# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) -+ for i in range(len(tgt_out)): -+ tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ - - try: - bleu_score(predicted, target) -@@ -375,7 +381,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +389,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+ for i in range(len(tgt_tokens)): -+ tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..5fd3d32 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220416_013009-2m8v6ch7/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..95199a3 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220416_013009-2m8v6ch7/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..f412bf7 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220416_013009-2m8v6ch7 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220416_013009-2m8v6ch7/files/output.log b/wandb/run-20220416_013009-2m8v6ch7/files/output.log deleted file mode 100644 index 21faf62..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/files/output.log +++ /dev/null @@ -1,25 +0,0 @@ - -train_translation.py --load=0 -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py:275: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -translation model saved in checkpoint -{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 70} -{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 70} -translation model saved in checkpoint -{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 116} -{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 116} -{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 117} -translation model saved in checkpoint -{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 164} -{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 165} -translation model saved in checkpoint -{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 182} -{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 183} -translation model saved in checkpoint \ No newline at end of file diff --git a/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt b/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220416_013009-2m8v6ch7/files/wandb-metadata.json b/wandb/run-20220416_013009-2m8v6ch7/files/wandb-metadata.json deleted file mode 100644 index dbffe1f..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/files/wandb-metadata.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T20:00:10.396365", - "startedAt": "2022-04-15T20:00:09.148879", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [ - "--load=0" - ], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json b/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json deleted file mode 100644 index 1fcb966..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"epoch_loss": 79.08950964609782, "_runtime": 195, "_timestamp": 1650053004, "_step": 5, "bleu_score": 0.0} \ No newline at end of file diff --git a/wandb/run-20220416_013009-2m8v6ch7/logs/debug-internal.log b/wandb/run-20220416_013009-2m8v6ch7/logs/debug-internal.log deleted file mode 100644 index 406d1ee..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/logs/debug-internal.log +++ /dev/null @@ -1,388 +0,0 @@ -2022-04-16 01:30:09,156 INFO wandb_internal:3047 [internal.py:wandb_internal():91] W&B internal server running at pid: 3047, started at: 2022-04-16 01:30:09.155690 -2022-04-16 01:30:09,157 INFO MainThread:3047 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:30:09,158 INFO MainThread:3047 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:30:09,158 DEBUG MainThread:3047 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-16 01:30:09,159 INFO MainThread:3047 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:30:09,160 INFO MainThread:3047 [wandb_init.py:init():484] communicating current version -2022-04-16 01:30:09,160 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: check_version -2022-04-16 01:30:09,160 INFO WriterThread:3047 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/run-2m8v6ch7.wandb -2022-04-16 01:30:09,161 DEBUG SenderThread:3047 [sender.py:send():179] send: header -2022-04-16 01:30:09,162 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: check_version -2022-04-16 01:30:09,594 INFO MainThread:3047 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:30:09,595 INFO MainThread:3047 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:30:09,595 DEBUG SenderThread:3047 [sender.py:send():179] send: run -2022-04-16 01:30:10,393 INFO MainThread:3047 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:30:10,393 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: run_start -2022-04-16 01:30:10,394 INFO SenderThread:3047 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files -2022-04-16 01:30:10,394 INFO SenderThread:3047 [sender.py:_start_run_threads():707] run started: 2m8v6ch7 with start time 1650052809 -2022-04-16 01:30:10,394 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:30:10,394 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:30:10,396 DEBUG HandlerThread:3047 [meta.py:__init__():39] meta init -2022-04-16 01:30:10,396 DEBUG HandlerThread:3047 [meta.py:__init__():53] meta init done -2022-04-16 01:30:10,396 DEBUG HandlerThread:3047 [meta.py:probe():210] probe -2022-04-16 01:30:10,402 DEBUG HandlerThread:3047 [meta.py:_setup_git():200] setup git -2022-04-16 01:30:10,417 DEBUG HandlerThread:3047 [meta.py:_setup_git():207] setup git done -2022-04-16 01:30:10,417 DEBUG HandlerThread:3047 [meta.py:_save_code():89] save code -2022-04-16 01:30:10,424 DEBUG HandlerThread:3047 [meta.py:_save_code():110] save code done -2022-04-16 01:30:10,424 DEBUG HandlerThread:3047 [meta.py:_save_patches():127] save patches -2022-04-16 01:30:10,560 DEBUG HandlerThread:3047 [meta.py:_save_patches():169] save patches done -2022-04-16 01:30:10,560 DEBUG HandlerThread:3047 [meta.py:_save_pip():57] save pip -2022-04-16 01:30:10,560 DEBUG HandlerThread:3047 [meta.py:_save_pip():71] save pip done -2022-04-16 01:30:10,560 DEBUG HandlerThread:3047 [meta.py:_save_conda():78] save conda -2022-04-16 01:30:11,399 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/diff.patch -2022-04-16 01:30:11,400 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt -2022-04-16 01:30:11,401 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml -2022-04-16 01:30:11,402 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/code/train_translation.py -2022-04-16 01:30:11,402 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:30:11,402 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/code -2022-04-16 01:30:13,396 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml -2022-04-16 01:30:13,402 DEBUG HandlerThread:3047 [meta.py:_save_conda():86] save conda done -2022-04-16 01:30:13,402 DEBUG HandlerThread:3047 [meta.py:probe():252] probe done -2022-04-16 01:30:13,405 DEBUG SenderThread:3047 [sender.py:send():179] send: files -2022-04-16 01:30:13,406 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:30:13,407 INFO SenderThread:3047 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:30:13,407 INFO SenderThread:3047 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:30:13,412 INFO MainThread:3047 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:30:13,415 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:30:13,415 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:30:13,415 INFO MainThread:3047 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:30:13,417 INFO MainThread:3047 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:30:13,418 INFO MainThread:3047 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:30:13,418 INFO MainThread:3047 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:30:13,418 INFO MainThread:3047 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:30:14,156 DEBUG SenderThread:3047 [sender.py:send():179] send: config -2022-04-16 01:30:14,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-metadata.json -2022-04-16 01:30:14,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:15,121 INFO Thread-14 :3047 [upload_job.py:push():133] Uploaded file /tmp/tmp_xxxs0wowandb/1u7lv5wr-wandb-metadata.json -2022-04-16 01:30:15,209 INFO Thread-17 :3047 [upload_job.py:push():133] Uploaded file /tmp/tmp_xxxs0wowandb/4wbr9a95-code/train_translation.py -2022-04-16 01:30:16,138 INFO Thread-22 :3047 [upload_job.py:push():133] Uploaded file /tmp/tmp_xxxs0wowandb/1f5szweq-diff.patch -2022-04-16 01:30:16,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:16,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/config.yaml -2022-04-16 01:30:18,399 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:25,465 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:27,470 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:27,660 DEBUG SenderThread:3047 [sender.py:send():179] send: history -2022-04-16 01:30:27,660 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:30:27,660 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:30:28,591 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:30:29,157 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:30:29,157 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:30:39,019 DEBUG SenderThread:3047 [sender.py:send():179] send: stats -2022-04-16 01:30:43,595 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:44,867 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:30:44,867 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:31:00,710 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:31:00,710 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:31:09,489 DEBUG SenderThread:3047 [sender.py:send():179] send: stats -2022-04-16 01:31:16,370 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:31:16,370 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:31:24,719 DEBUG SenderThread:3047 [sender.py:send():179] send: history -2022-04-16 01:31:24,719 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:31:24,720 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:31:25,608 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:31:31,028 DEBUG SenderThread:3047 [sender.py:send():179] send: history -2022-04-16 01:31:31,029 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:31:31,029 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:31:31,609 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:31:31,610 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:31:32,032 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:31:32,032 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:31:40,142 DEBUG SenderThread:3047 [sender.py:send():179] send: stats -2022-04-16 01:31:43,612 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:31:47,765 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:31:47,765 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:32:03,456 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:32:03,456 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:32:17,464 DEBUG SenderThread:3047 [sender.py:send():179] send: history -2022-04-16 01:32:17,464 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:32:17,466 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:32:17,622 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:32:17,622 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:32:19,176 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:32:19,176 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:32:33,638 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:32:34,812 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:32:34,812 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:32:50,521 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:32:50,521 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:33:05,050 DEBUG SenderThread:3047 [sender.py:send():179] send: history -2022-04-16 01:33:05,050 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:33:05,052 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:33:05,647 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:33:05,647 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:06,206 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:33:06,206 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:33:09,011 DEBUG SenderThread:3047 [sender.py:send():179] send: stats -2022-04-16 01:33:19,651 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:21,889 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:33:21,889 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:33:23,662 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:24,091 DEBUG SenderThread:3047 [sender.py:send():179] send: history -2022-04-16 01:33:24,091 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:33:24,092 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:33:24,662 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:33:37,579 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:33:37,579 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:33:38,348 INFO MainThread:3047 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2m8v6ch7 -2022-04-16 01:33:38,349 INFO MainThread:3047 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 -2022-04-16 01:33:38,350 INFO MainThread:3047 [wandb_run.py:_restore():1480] restore -2022-04-16 01:33:38,674 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:38,941 DEBUG SenderThread:3047 [sender.py:send():179] send: telemetry -2022-04-16 01:33:38,943 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:38,943 DEBUG SenderThread:3047 [sender.py:send():179] send: exit -2022-04-16 01:33:38,944 INFO SenderThread:3047 [sender.py:send_exit():287] handling exit code: 0 -2022-04-16 01:33:38,944 INFO SenderThread:3047 [sender.py:send_exit():295] send defer -2022-04-16 01:33:38,944 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:38,946 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 2 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1744922 - total_bytes: 1744922 -} - -2022-04-16 01:33:38,948 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:38,948 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 0 -2022-04-16 01:33:38,949 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:38,949 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 0 -2022-04-16 01:33:38,949 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 1 -2022-04-16 01:33:38,950 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:38,950 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 1 -2022-04-16 01:33:39,026 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:39,026 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 1 -2022-04-16 01:33:39,026 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 2 -2022-04-16 01:33:39,027 DEBUG SenderThread:3047 [sender.py:send():179] send: stats -2022-04-16 01:33:39,027 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:39,027 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 2 -2022-04-16 01:33:39,027 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:39,027 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 2 -2022-04-16 01:33:39,027 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 3 -2022-04-16 01:33:39,028 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:39,028 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 3 -2022-04-16 01:33:39,028 DEBUG SenderThread:3047 [sender.py:send():179] send: summary -2022-04-16 01:33:39,028 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:33:39,028 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:39,029 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 3 -2022-04-16 01:33:39,029 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 4 -2022-04-16 01:33:39,029 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:39,029 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 4 -2022-04-16 01:33:39,029 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:39,029 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 4 -2022-04-16 01:33:39,048 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:39,675 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:33:39,675 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:39,793 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 5 -2022-04-16 01:33:39,793 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:39,794 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:39,794 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 5 -2022-04-16 01:33:39,794 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:39,795 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 5 -2022-04-16 01:33:39,795 INFO SenderThread:3047 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-16 01:33:39,795 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 2 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1744922 - total_bytes: 1744922 -} - -2022-04-16 01:33:39,897 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:40,675 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/config.yaml -2022-04-16 01:33:40,677 INFO SenderThread:3047 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files -2022-04-16 01:33:40,677 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt requirements.txt -2022-04-16 01:33:40,678 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-metadata.json wandb-metadata.json -2022-04-16 01:33:40,678 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log output.log -2022-04-16 01:33:40,679 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml conda-environment.yaml -2022-04-16 01:33:40,679 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json wandb-summary.json -2022-04-16 01:33:40,690 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/config.yaml config.yaml -2022-04-16 01:33:40,701 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/diff.patch diff.patch -2022-04-16 01:33:40,730 INFO SenderThread:3047 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/code/train_translation.py code/train_translation.py -2022-04-16 01:33:40,730 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 6 -2022-04-16 01:33:40,731 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:40,732 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:40,733 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 6 -2022-04-16 01:33:40,734 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1744922 - total_bytes: 1754733 -} - -2022-04-16 01:33:40,734 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:40,735 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 6 -2022-04-16 01:33:40,735 INFO SenderThread:3047 [file_pusher.py:finish():176] shutting down file pusher -2022-04-16 01:33:40,735 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 7 -2022-04-16 01:33:40,736 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:40,737 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 7 -2022-04-16 01:33:40,737 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:40,737 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 7 -2022-04-16 01:33:40,835 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:41,471 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 8 -2022-04-16 01:33:41,471 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:41,473 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:41,473 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1744922 - total_bytes: 1754733 -} - -2022-04-16 01:33:41,474 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 8 -2022-04-16 01:33:41,475 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:41,475 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 8 -2022-04-16 01:33:41,476 INFO SenderThread:3047 [sender.py:send_request_defer():342] send defer: 9 -2022-04-16 01:33:41,477 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:33:41,478 DEBUG SenderThread:3047 [sender.py:send():179] send: final -2022-04-16 01:33:41,478 INFO HandlerThread:3047 [handler.py:handle_request_defer():141] handle defer: 9 -2022-04-16 01:33:41,478 DEBUG SenderThread:3047 [sender.py:send():179] send: footer -2022-04-16 01:33:41,479 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: defer -2022-04-16 01:33:41,480 INFO SenderThread:3047 [sender.py:send_request_defer():304] handle sender defer: 9 -2022-04-16 01:33:41,575 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:41,576 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:41,576 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1744922 - total_bytes: 1754733 -} - -2022-04-16 01:33:41,678 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:41,679 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:41,680 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1744922 - total_bytes: 1754733 -} - -2022-04-16 01:33:41,781 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:41,782 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:41,783 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:41,885 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:41,886 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:41,887 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:41,989 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:41,990 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:41,991 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,092 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:42,092 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:42,093 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,194 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:42,195 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:42,196 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,277 INFO Thread-29 :3047 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt -2022-04-16 01:33:42,283 INFO Thread-30 :3047 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:42,286 INFO Thread-31 :3047 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml -2022-04-16 01:33:42,297 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:42,298 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:42,299 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,351 INFO Thread-32 :3047 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:33:42,365 INFO Thread-33 :3047 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/config.yaml -2022-04-16 01:33:42,401 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:42,401 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:42,403 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,504 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:42,505 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:42,506 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,608 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:33:42,608 DEBUG SenderThread:3047 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:33:42,609 INFO SenderThread:3047 [file_pusher.py:join():181] waiting for file pusher -2022-04-16 01:33:42,610 INFO MainThread:3047 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true -exit_result { -} -file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1754733 - total_bytes: 1754733 -} - -2022-04-16 01:33:42,611 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: get_summary -2022-04-16 01:33:42,613 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: sampled_history -2022-04-16 01:33:42,616 DEBUG HandlerThread:3047 [handler.py:handle_request():124] handle_request: shutdown -2022-04-16 01:33:42,617 INFO HandlerThread:3047 [handler.py:finish():638] shutting down handler -2022-04-16 01:33:43,478 INFO WriterThread:3047 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/run-2m8v6ch7.wandb -2022-04-16 01:33:43,609 INFO SenderThread:3047 [sender.py:finish():933] shutting down sender -2022-04-16 01:33:43,610 INFO SenderThread:3047 [file_pusher.py:finish():176] shutting down file pusher -2022-04-16 01:33:43,610 INFO SenderThread:3047 [file_pusher.py:join():181] waiting for file pusher -2022-04-16 01:33:43,634 INFO MainThread:3047 [wandb_run.py:_show_summary():1785] rendering summary -2022-04-16 01:33:43,635 INFO MainThread:3047 [wandb_run.py:_show_history():1823] rendering history -2022-04-16 01:33:43,635 INFO MainThread:3047 [wandb_run.py:_show_files():1852] logging synced files -2022-04-16 01:33:43,661 INFO MainThread:3047 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220416_013009-2m8v6ch7/logs/debug.log b/wandb/run-20220416_013009-2m8v6ch7/logs/debug.log deleted file mode 100644 index 329a7e5..0000000 --- a/wandb/run-20220416_013009-2m8v6ch7/logs/debug.log +++ /dev/null @@ -1,69 +0,0 @@ -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/logs/debug.log -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/logs/debug-internal.log -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_init.py:init():369] calling init triggers -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 5, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:30:09,150 INFO MainThread:3047 [wandb_init.py:init():418] starting backend -2022-04-16 01:30:09,155 INFO MainThread:3047 [backend.py:ensure_launched():132] starting backend process... -2022-04-16 01:30:09,156 INFO wandb_internal:3047 [internal.py:wandb_internal():91] W&B internal server running at pid: 3047, started at: 2022-04-16 01:30:09.155690 -2022-04-16 01:30:09,157 INFO MainThread:3047 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:30:09,158 INFO MainThread:3047 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:30:09,159 INFO MainThread:3047 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:30:09,160 INFO MainThread:3047 [wandb_init.py:init():484] communicating current version -2022-04-16 01:30:09,160 INFO WriterThread:3047 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/run-2m8v6ch7.wandb -2022-04-16 01:30:09,594 INFO MainThread:3047 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:30:09,595 INFO MainThread:3047 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:30:10,393 INFO MainThread:3047 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:30:10,394 INFO SenderThread:3047 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files -2022-04-16 01:30:10,394 INFO SenderThread:3047 [sender.py:_start_run_threads():707] run started: 2m8v6ch7 with start time 1650052809 -2022-04-16 01:30:10,394 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:30:11,399 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/diff.patch -2022-04-16 01:30:11,400 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/requirements.txt -2022-04-16 01:30:11,401 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml -2022-04-16 01:30:11,402 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/code/train_translation.py -2022-04-16 01:30:11,402 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:30:11,402 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/code -2022-04-16 01:30:13,396 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/conda-environment.yaml -2022-04-16 01:30:13,406 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:30:13,407 INFO SenderThread:3047 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:30:13,407 INFO SenderThread:3047 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:30:13,412 INFO MainThread:3047 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:30:13,415 INFO MainThread:3047 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:30:13,417 INFO MainThread:3047 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:30:13,418 INFO MainThread:3047 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:30:13,418 INFO MainThread:3047 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:30:13,418 INFO MainThread:3047 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:30:14,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-metadata.json -2022-04-16 01:30:14,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:15,121 INFO Thread-14 :3047 [upload_job.py:push():133] Uploaded file /tmp/tmp_xxxs0wowandb/1u7lv5wr-wandb-metadata.json -2022-04-16 01:30:15,209 INFO Thread-17 :3047 [upload_job.py:push():133] Uploaded file /tmp/tmp_xxxs0wowandb/4wbr9a95-code/train_translation.py -2022-04-16 01:30:16,138 INFO Thread-22 :3047 [upload_job.py:push():133] Uploaded file /tmp/tmp_xxxs0wowandb/1f5szweq-diff.patch -2022-04-16 01:30:16,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:16,398 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/config.yaml -2022-04-16 01:30:18,399 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:25,465 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:27,470 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:30:27,660 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:30:28,591 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:30:43,595 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:31:24,720 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:31:25,608 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:31:31,029 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:31:31,609 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:31:31,610 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:31:43,612 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:32:17,466 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:32:17,622 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:32:17,622 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:32:33,638 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:05,052 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:33:05,647 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:33:05,647 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:19,651 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:23,662 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/output.log -2022-04-16 01:33:24,092 INFO SenderThread:3047 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:33:24,662 INFO Thread-11 :3047 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013009-2m8v6ch7/files/wandb-summary.json -2022-04-16 01:33:38,348 INFO MainThread:3047 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2m8v6ch7 diff --git a/wandb/run-20220416_013009-2m8v6ch7/run-2m8v6ch7.wandb b/wandb/run-20220416_013009-2m8v6ch7/run-2m8v6ch7.wandb deleted file mode 100644 index 4cd4d16..0000000 Binary files a/wandb/run-20220416_013009-2m8v6ch7/run-2m8v6ch7.wandb and /dev/null differ diff --git a/wandb/run-20220416_013544-2rw6cucs/files/code/train_translation.py b/wandb/run-20220416_013544-2rw6cucs/files/code/train_translation.py deleted file mode 100644 index ecaff5f..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/code/train_translation.py +++ /dev/null @@ -1,402 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=10, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - -# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) - for i in range(len(tgt_out)): - tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - - for i in range(len(tgt_tokens)): - tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml b/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220416_013544-2rw6cucs/files/config.yaml b/wandb/run-20220416_013544-2rw6cucs/files/config.yaml deleted file mode 100644 index d0bb2ba..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/config.yaml +++ /dev/null @@ -1,115 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 2: - - 1 - - 11 - 3: - - 2 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 16 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 10 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 2 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 2 diff --git a/wandb/run-20220416_013544-2rw6cucs/files/diff.patch b/wandb/run-20220416_013544-2rw6cucs/files/diff.patch deleted file mode 100644 index 569fe58..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/diff.patch +++ /dev/null @@ -1,30779 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..03d7a9b 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,173 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -+{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -+{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -+{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -+{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -+{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -+{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -+{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -+{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -+train_translation.py -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 28} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 155} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 281} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 405} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 530} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 657} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 783} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 908} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 1033} -+train_translation.py -+train_translation.py -+train_translation.py -+train_translation.py --load=1 -+train_translation.py --load=1 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 66} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 179} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 16} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 184} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 240} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 296} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 352} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 408} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 464} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 692} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 106} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 441} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 19} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 104} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 188} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -+{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 70} -+{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 70} -+{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 116} -+{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 116} -+{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 117} -+{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 164} -+{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 165} -+{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 182} -+{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 183} -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..ecaff5f 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -47,9 +48,9 @@ parser = argparse.ArgumentParser(description = 'Translation') - # Training hyper-parameters: - parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -+parser.add_argument('--epochs', default=10, type=int, metavar='N', - help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -+parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') - parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -267,7 +269,7 @@ def main_worker(gpu, args): - optimizer.step() - # losses += loss.item() - -- # wandb.log({'iter_loss': loss}) -+# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,13 +327,17 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -+# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) -+ for i in range(len(tgt_out)): -+ tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ - - try: - bleu_score(predicted, target) -@@ -375,7 +381,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +389,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+ for i in range(len(tgt_tokens)): -+ tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..fdf4076 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220416_013544-2rw6cucs/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..83d0ecb 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220416_013544-2rw6cucs/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..923d2ad 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220416_013544-2rw6cucs -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220416_013544-2rw6cucs/files/output.log b/wandb/run-20220416_013544-2rw6cucs/files/output.log deleted file mode 100644 index 658db0f..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/output.log +++ /dev/null @@ -1,42 +0,0 @@ - -train_translation.py --load=0 -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py:275: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -translation model saved in checkpoint -{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 73} -{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 74} -translation model saved in checkpoint -{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 92} -{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 93} -{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 93} -translation model saved in checkpoint -{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 110} -{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 111} -translation model saved in checkpoint -{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 131} -{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 132} -translation model saved in checkpoint -{"epoch": 5, "step": 60, "loss": 62.27414321899414, "time": 149} -{"epoch": 5, "step": 65, "loss": 90.9207992553711, "time": 150} -{"epoch": 5, "step": 70, "loss": 66.96754455566406, "time": 150} -translation model saved in checkpoint -{"epoch": 6, "step": 75, "loss": 71.40245819091797, "time": 216} -{"epoch": 6, "step": 80, "loss": 63.940818786621094, "time": 217} -translation model saved in checkpoint -{"epoch": 7, "step": 85, "loss": 50.857147216796875, "time": 233} -{"epoch": 7, "step": 90, "loss": 78.37335205078125, "time": 234} -{"epoch": 7, "step": 95, "loss": 100.13611602783203, "time": 234} -translation model saved in checkpoint -{"epoch": 8, "step": 100, "loss": 80.35195922851562, "time": 252} -{"epoch": 8, "step": 105, "loss": 86.00081634521484, "time": 253} -translation model saved in checkpoint -{"epoch": 9, "step": 110, "loss": 82.35330200195312, "time": 272} -{"epoch": 9, "step": 115, "loss": 88.81517791748047, "time": 273} -translation model saved in checkpoint \ No newline at end of file diff --git a/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt b/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220416_013544-2rw6cucs/files/wandb-metadata.json b/wandb/run-20220416_013544-2rw6cucs/files/wandb-metadata.json deleted file mode 100644 index 9a29c9c..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/wandb-metadata.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T20:05:45.959756", - "startedAt": "2022-04-15T20:05:44.728209", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [ - "--load=0" - ], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json b/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json deleted file mode 100644 index c14a271..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"epoch_loss": 86.59892717997234, "_runtime": 284, "_timestamp": 1650053428, "_step": 11, "bleu_score": 0.0} \ No newline at end of file diff --git a/wandb/run-20220416_013544-2rw6cucs/logs/debug-internal.log b/wandb/run-20220416_013544-2rw6cucs/logs/debug-internal.log deleted file mode 100644 index e841066..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/logs/debug-internal.log +++ /dev/null @@ -1,441 +0,0 @@ -2022-04-16 01:35:44,735 INFO wandb_internal:4584 [internal.py:wandb_internal():91] W&B internal server running at pid: 4584, started at: 2022-04-16 01:35:44.734800 -2022-04-16 01:35:44,735 INFO MainThread:4584 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:35:44,736 INFO MainThread:4584 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:35:44,737 DEBUG MainThread:4584 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-16 01:35:44,738 INFO MainThread:4584 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:35:44,738 INFO MainThread:4584 [wandb_init.py:init():484] communicating current version -2022-04-16 01:35:44,739 DEBUG SenderThread:4584 [sender.py:send():179] send: header -2022-04-16 01:35:44,739 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: check_version -2022-04-16 01:35:44,741 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: check_version -2022-04-16 01:35:44,740 INFO WriterThread:4584 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/run-2rw6cucs.wandb -2022-04-16 01:35:45,091 INFO MainThread:4584 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:35:45,091 INFO MainThread:4584 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:35:45,095 DEBUG SenderThread:4584 [sender.py:send():179] send: run -2022-04-16 01:35:45,945 INFO MainThread:4584 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:35:45,948 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: run_start -2022-04-16 01:35:45,951 INFO SenderThread:4584 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files -2022-04-16 01:35:45,951 INFO SenderThread:4584 [sender.py:_start_run_threads():707] run started: 2rw6cucs with start time 1650053144 -2022-04-16 01:35:45,952 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:35:45,952 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:35:45,959 DEBUG HandlerThread:4584 [meta.py:__init__():39] meta init -2022-04-16 01:35:45,959 DEBUG HandlerThread:4584 [meta.py:__init__():53] meta init done -2022-04-16 01:35:45,959 DEBUG HandlerThread:4584 [meta.py:probe():210] probe -2022-04-16 01:35:45,968 DEBUG HandlerThread:4584 [meta.py:_setup_git():200] setup git -2022-04-16 01:35:46,021 DEBUG HandlerThread:4584 [meta.py:_setup_git():207] setup git done -2022-04-16 01:35:46,022 DEBUG HandlerThread:4584 [meta.py:_save_code():89] save code -2022-04-16 01:35:46,039 DEBUG HandlerThread:4584 [meta.py:_save_code():110] save code done -2022-04-16 01:35:46,039 DEBUG HandlerThread:4584 [meta.py:_save_patches():127] save patches -2022-04-16 01:35:46,144 DEBUG HandlerThread:4584 [meta.py:_save_patches():169] save patches done -2022-04-16 01:35:46,145 DEBUG HandlerThread:4584 [meta.py:_save_pip():57] save pip -2022-04-16 01:35:46,145 DEBUG HandlerThread:4584 [meta.py:_save_pip():71] save pip done -2022-04-16 01:35:46,145 DEBUG HandlerThread:4584 [meta.py:_save_conda():78] save conda -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/diff.patch -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/code/train_translation.py -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/code -2022-04-16 01:35:47,657 DEBUG HandlerThread:4584 [meta.py:_save_conda():86] save conda done -2022-04-16 01:35:47,657 DEBUG HandlerThread:4584 [meta.py:probe():252] probe done -2022-04-16 01:35:47,659 DEBUG SenderThread:4584 [sender.py:send():179] send: files -2022-04-16 01:35:47,660 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:35:47,661 INFO SenderThread:4584 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:35:47,661 INFO SenderThread:4584 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:35:47,668 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:35:47,668 INFO MainThread:4584 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:35:47,669 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:35:47,670 INFO MainThread:4584 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:35:47,670 INFO MainThread:4584 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:35:47,671 INFO MainThread:4584 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:35:47,671 INFO MainThread:4584 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:35:47,671 INFO MainThread:4584 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:35:47,951 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml -2022-04-16 01:35:47,951 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-metadata.json -2022-04-16 01:35:47,951 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:35:48,344 DEBUG SenderThread:4584 [sender.py:send():179] send: config -2022-04-16 01:35:49,366 INFO Thread-14 :4584 [upload_job.py:push():133] Uploaded file /tmp/tmp43zrqffgwandb/2pht4hd1-wandb-metadata.json -2022-04-16 01:35:49,466 INFO Thread-16 :4584 [upload_job.py:push():133] Uploaded file /tmp/tmp43zrqffgwandb/1v7xd8v7-code/train_translation.py -2022-04-16 01:35:49,953 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:35:50,313 INFO Thread-22 :4584 [upload_job.py:push():133] Uploaded file /tmp/tmp43zrqffgwandb/2zhfst8q-diff.patch -2022-04-16 01:35:50,953 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/config.yaml -2022-04-16 01:35:51,953 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:35:53,954 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:36:01,747 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:36:01,747 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:36:01,747 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:36:01,956 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:36:01,957 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:36:03,346 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:36:03,347 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:36:13,960 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:36:14,917 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:36:19,013 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:36:19,014 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:36:34,658 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:36:34,658 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:36:45,432 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:36:50,310 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:36:50,310 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:37:02,753 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:37:02,753 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:37:02,754 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:02,975 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:05,982 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:37:05,982 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:37:09,307 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:37:09,307 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:37:09,307 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:09,982 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:09,982 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:16,103 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:37:21,651 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:37:21,651 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:37:23,988 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:27,989 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:28,464 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:37:28,464 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:37:28,465 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:28,992 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:29,992 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:37,481 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:37:37,481 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:37:42,029 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:46,029 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:46,461 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:37:46,461 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:37:46,462 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:46,663 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:37:47,033 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:48,033 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:53,201 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:37:53,201 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:38:02,037 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:06,038 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:07,261 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:38:07,261 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:38:07,262 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:38:08,288 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:38:08,364 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:08,927 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:38:08,927 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:38:17,417 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:38:20,291 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:24,293 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:24,597 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:38:24,597 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:38:25,471 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:38:25,471 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:38:25,471 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:38:26,500 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:38:26,500 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:40,265 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:38:40,266 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:38:40,504 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:48,033 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:38:55,936 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:38:55,936 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:39:11,586 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:39:11,586 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:39:18,577 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:39:25,381 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:39:25,381 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:39:25,384 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:39:25,519 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:39:27,259 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:39:27,259 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:39:32,019 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:39:32,019 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:39:32,020 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:39:32,545 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:39:32,545 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:43,051 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:39:43,052 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:39:44,548 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:48,550 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:49,332 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:39:49,589 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:39:49,589 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:39:49,589 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:39:50,604 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:39:50,605 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:58,737 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:39:58,738 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:40:04,608 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:08,350 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:40:08,350 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:40:08,350 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:40:08,610 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:40:08,610 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:14,447 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:40:14,447 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:40:20,107 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:40:24,614 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:28,328 DEBUG SenderThread:4584 [sender.py:send():179] send: history -2022-04-16 01:40:28,328 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:40:28,328 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:40:28,621 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:40:28,621 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:30,122 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:40:30,122 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:40:42,389 INFO MainThread:4584 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2rw6cucs -2022-04-16 01:40:42,390 INFO MainThread:4584 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 -2022-04-16 01:40:42,391 INFO MainThread:4584 [wandb_run.py:_restore():1480] restore -2022-04-16 01:40:43,356 DEBUG SenderThread:4584 [sender.py:send():179] send: telemetry -2022-04-16 01:40:43,357 DEBUG SenderThread:4584 [sender.py:send():179] send: exit -2022-04-16 01:40:43,357 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:43,358 INFO SenderThread:4584 [sender.py:send_exit():287] handling exit code: 0 -2022-04-16 01:40:43,358 INFO SenderThread:4584 [sender.py:send_exit():295] send defer -2022-04-16 01:40:43,359 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:43,360 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:43,361 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 0 -2022-04-16 01:40:43,361 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:43,361 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 0 -2022-04-16 01:40:43,362 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 1 -2022-04-16 01:40:43,363 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:43,363 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 1 -2022-04-16 01:40:43,363 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 2 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1745897 - total_bytes: 1745897 -} - -2022-04-16 01:40:43,436 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:43,436 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 1 -2022-04-16 01:40:43,436 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 2 -2022-04-16 01:40:43,437 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:43,437 DEBUG SenderThread:4584 [sender.py:send():179] send: stats -2022-04-16 01:40:43,437 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 2 -2022-04-16 01:40:43,437 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:43,437 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 2 -2022-04-16 01:40:43,437 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 3 -2022-04-16 01:40:43,438 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:43,438 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 3 -2022-04-16 01:40:43,438 DEBUG SenderThread:4584 [sender.py:send():179] send: summary -2022-04-16 01:40:43,438 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:40:43,439 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:43,439 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 3 -2022-04-16 01:40:43,439 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 4 -2022-04-16 01:40:43,439 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:43,439 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 4 -2022-04-16 01:40:43,439 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:43,439 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 4 -2022-04-16 01:40:43,465 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:43,631 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:40:43,632 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:44,194 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 5 -2022-04-16 01:40:44,194 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:44,196 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:44,196 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 5 -2022-04-16 01:40:44,196 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 2 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1745897 - total_bytes: 1745897 -} - -2022-04-16 01:40:44,197 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:44,197 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 5 -2022-04-16 01:40:44,198 INFO SenderThread:4584 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-16 01:40:44,298 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:44,632 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/config.yaml -2022-04-16 01:40:44,634 INFO SenderThread:4584 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files -2022-04-16 01:40:44,634 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt requirements.txt -2022-04-16 01:40:44,635 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-metadata.json wandb-metadata.json -2022-04-16 01:40:44,635 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log output.log -2022-04-16 01:40:44,642 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml conda-environment.yaml -2022-04-16 01:40:44,644 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json wandb-summary.json -2022-04-16 01:40:44,644 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/config.yaml config.yaml -2022-04-16 01:40:44,644 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/diff.patch diff.patch -2022-04-16 01:40:44,646 INFO SenderThread:4584 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/code/train_translation.py code/train_translation.py -2022-04-16 01:40:44,646 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 6 -2022-04-16 01:40:44,647 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:44,647 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:44,647 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 6 -2022-04-16 01:40:44,649 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:44,649 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 6 -2022-04-16 01:40:44,649 INFO SenderThread:4584 [file_pusher.py:finish():176] shutting down file pusher -2022-04-16 01:40:44,649 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 7 -2022-04-16 01:40:44,651 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:44,651 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 7 -2022-04-16 01:40:44,651 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1745897 - total_bytes: 1756683 -} - -2022-04-16 01:40:44,651 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:44,652 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 7 -2022-04-16 01:40:44,753 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:45,419 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 8 -2022-04-16 01:40:45,420 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:45,421 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:45,421 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 8 -2022-04-16 01:40:45,422 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:45,422 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1745897 - total_bytes: 1756683 -} - -2022-04-16 01:40:45,423 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 8 -2022-04-16 01:40:45,424 INFO SenderThread:4584 [sender.py:send_request_defer():342] send defer: 9 -2022-04-16 01:40:45,426 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: defer -2022-04-16 01:40:45,426 INFO HandlerThread:4584 [handler.py:handle_request_defer():141] handle defer: 9 -2022-04-16 01:40:45,426 DEBUG SenderThread:4584 [sender.py:send():179] send: final -2022-04-16 01:40:45,427 DEBUG SenderThread:4584 [sender.py:send():179] send: footer -2022-04-16 01:40:45,427 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: defer -2022-04-16 01:40:45,427 INFO SenderThread:4584 [sender.py:send_request_defer():304] handle sender defer: 9 -2022-04-16 01:40:45,524 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:45,525 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:45,526 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1745897 - total_bytes: 1756683 -} - -2022-04-16 01:40:45,627 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:45,628 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:45,629 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1750664 - total_bytes: 1756683 -} - -2022-04-16 01:40:45,730 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:45,731 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:45,732 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:45,834 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:45,835 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:45,836 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:45,938 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:45,940 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:45,942 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,043 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:46,044 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:46,045 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,147 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:46,148 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:46,149 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,218 INFO Thread-35 :4584 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt -2022-04-16 01:40:46,227 INFO Thread-37 :4584 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml -2022-04-16 01:40:46,246 INFO Thread-36 :4584 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:46,250 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:46,253 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:46,255 INFO Thread-38 :4584 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:40:46,257 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,272 INFO Thread-39 :4584 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/config.yaml -2022-04-16 01:40:46,358 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:46,359 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:46,360 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,462 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:46,462 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:46,464 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,565 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: poll_exit -2022-04-16 01:40:46,566 DEBUG SenderThread:4584 [sender.py:send_request():193] send_request: poll_exit -2022-04-16 01:40:46,566 INFO SenderThread:4584 [file_pusher.py:join():181] waiting for file pusher -2022-04-16 01:40:46,567 INFO MainThread:4584 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true -exit_result { -} -file_counts { - wandb_count: 7 - other_count: 1 -} -pusher_stats { - uploaded_bytes: 1756683 - total_bytes: 1756683 -} - -2022-04-16 01:40:46,569 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: get_summary -2022-04-16 01:40:46,571 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: sampled_history -2022-04-16 01:40:46,575 DEBUG HandlerThread:4584 [handler.py:handle_request():124] handle_request: shutdown -2022-04-16 01:40:46,575 INFO HandlerThread:4584 [handler.py:finish():638] shutting down handler -2022-04-16 01:40:47,428 INFO WriterThread:4584 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/run-2rw6cucs.wandb -2022-04-16 01:40:47,567 INFO SenderThread:4584 [sender.py:finish():933] shutting down sender -2022-04-16 01:40:47,567 INFO SenderThread:4584 [file_pusher.py:finish():176] shutting down file pusher -2022-04-16 01:40:47,567 INFO SenderThread:4584 [file_pusher.py:join():181] waiting for file pusher -2022-04-16 01:40:47,579 INFO MainThread:4584 [wandb_run.py:_show_summary():1785] rendering summary -2022-04-16 01:40:47,579 INFO MainThread:4584 [wandb_run.py:_show_history():1823] rendering history -2022-04-16 01:40:47,580 INFO MainThread:4584 [wandb_run.py:_show_files():1852] logging synced files -2022-04-16 01:40:47,627 INFO MainThread:4584 [internal.py:handle_exit():78] Internal process exited diff --git a/wandb/run-20220416_013544-2rw6cucs/logs/debug.log b/wandb/run-20220416_013544-2rw6cucs/logs/debug.log deleted file mode 100644 index e2cfa8d..0000000 --- a/wandb/run-20220416_013544-2rw6cucs/logs/debug.log +++ /dev/null @@ -1,96 +0,0 @@ -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/logs/debug.log -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/logs/debug-internal.log -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_init.py:init():369] calling init triggers -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:35:44,729 INFO MainThread:4584 [wandb_init.py:init():418] starting backend -2022-04-16 01:35:44,734 INFO MainThread:4584 [backend.py:ensure_launched():132] starting backend process... -2022-04-16 01:35:44,735 INFO wandb_internal:4584 [internal.py:wandb_internal():91] W&B internal server running at pid: 4584, started at: 2022-04-16 01:35:44.734800 -2022-04-16 01:35:44,735 INFO MainThread:4584 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:35:44,736 INFO MainThread:4584 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:35:44,738 INFO MainThread:4584 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:35:44,738 INFO MainThread:4584 [wandb_init.py:init():484] communicating current version -2022-04-16 01:35:44,740 INFO WriterThread:4584 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/run-2rw6cucs.wandb -2022-04-16 01:35:45,091 INFO MainThread:4584 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:35:45,091 INFO MainThread:4584 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:35:45,945 INFO MainThread:4584 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:35:45,951 INFO SenderThread:4584 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files -2022-04-16 01:35:45,951 INFO SenderThread:4584 [sender.py:_start_run_threads():707] run started: 2rw6cucs with start time 1650053144 -2022-04-16 01:35:45,952 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/diff.patch -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/code/train_translation.py -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/requirements.txt -2022-04-16 01:35:46,952 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/code -2022-04-16 01:35:47,660 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:35:47,661 INFO SenderThread:4584 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:35:47,661 INFO SenderThread:4584 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:35:47,668 INFO MainThread:4584 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:35:47,670 INFO MainThread:4584 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:35:47,670 INFO MainThread:4584 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:35:47,671 INFO MainThread:4584 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:35:47,671 INFO MainThread:4584 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:35:47,671 INFO MainThread:4584 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:35:47,951 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/conda-environment.yaml -2022-04-16 01:35:47,951 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-metadata.json -2022-04-16 01:35:47,951 INFO Thread-11 :4584 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:35:49,366 INFO Thread-14 :4584 [upload_job.py:push():133] Uploaded file /tmp/tmp43zrqffgwandb/2pht4hd1-wandb-metadata.json -2022-04-16 01:35:49,466 INFO Thread-16 :4584 [upload_job.py:push():133] Uploaded file /tmp/tmp43zrqffgwandb/1v7xd8v7-code/train_translation.py -2022-04-16 01:35:49,953 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:35:50,313 INFO Thread-22 :4584 [upload_job.py:push():133] Uploaded file /tmp/tmp43zrqffgwandb/2zhfst8q-diff.patch -2022-04-16 01:35:50,953 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/config.yaml -2022-04-16 01:35:51,953 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:35:53,954 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:36:01,747 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:36:01,956 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:36:01,957 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:36:13,960 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:02,754 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:02,975 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:09,307 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:09,982 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:09,982 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:23,988 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:27,989 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:28,465 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:28,992 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:29,992 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:42,029 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:46,029 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:37:46,462 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:37:47,033 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:37:48,033 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:02,037 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:06,038 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:07,262 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:38:08,288 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:38:08,364 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:20,291 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:24,293 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:25,471 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:38:26,500 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:38:26,500 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:38:40,504 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:25,384 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:39:25,519 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:39:32,020 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:39:32,545 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:39:32,545 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:44,548 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:48,550 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:39:49,589 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:39:50,604 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:39:50,605 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:04,608 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:08,350 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:40:08,610 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:40:08,610 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:24,614 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:28,328 INFO SenderThread:4584 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:40:28,621 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/wandb-summary.json -2022-04-16 01:40:28,621 INFO Thread-11 :4584 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_013544-2rw6cucs/files/output.log -2022-04-16 01:40:42,389 INFO MainThread:4584 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2rw6cucs diff --git a/wandb/run-20220416_013544-2rw6cucs/run-2rw6cucs.wandb b/wandb/run-20220416_013544-2rw6cucs/run-2rw6cucs.wandb deleted file mode 100644 index f34d5f1..0000000 Binary files a/wandb/run-20220416_013544-2rw6cucs/run-2rw6cucs.wandb and /dev/null differ diff --git a/wandb/run-20220416_014133-qw6te5do/files/code/train_translation.py b/wandb/run-20220416_014133-qw6te5do/files/code/train_translation.py deleted file mode 100644 index 245e045..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/code/train_translation.py +++ /dev/null @@ -1,405 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=10, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - -# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) - for i in range(len(tgt_out)): - tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - print('out', out) - print('predicted', tgt_out) - - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - - for i in range(len(tgt_tokens)): - tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml b/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220416_014133-qw6te5do/files/config.yaml b/wandb/run-20220416_014133-qw6te5do/files/config.yaml deleted file mode 100644 index 52b4100..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 16 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 10 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 2 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 2 diff --git a/wandb/run-20220416_014133-qw6te5do/files/diff.patch b/wandb/run-20220416_014133-qw6te5do/files/diff.patch deleted file mode 100644 index 290700b..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/diff.patch +++ /dev/null @@ -1,30813 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..f8b257c 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,198 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -+{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -+{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -+{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -+{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -+{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -+{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -+{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -+{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -+train_translation.py -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 28} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 155} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 281} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 405} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 530} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 657} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 783} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 908} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 1033} -+train_translation.py -+train_translation.py -+train_translation.py -+train_translation.py --load=1 -+train_translation.py --load=1 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 66} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 179} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 16} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 184} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 240} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 296} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 352} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 408} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 464} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 692} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 106} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 441} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 19} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 104} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 188} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -+{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 70} -+{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 70} -+{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 116} -+{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 116} -+{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 117} -+{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 164} -+{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 165} -+{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 182} -+{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 183} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -+{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 73} -+{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 74} -+{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 92} -+{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 93} -+{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 93} -+{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 110} -+{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 111} -+{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 131} -+{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 132} -+{"epoch": 5, "step": 60, "loss": 62.27414321899414, "time": 149} -+{"epoch": 5, "step": 65, "loss": 90.9207992553711, "time": 150} -+{"epoch": 5, "step": 70, "loss": 66.96754455566406, "time": 150} -+{"epoch": 6, "step": 75, "loss": 71.40245819091797, "time": 216} -+{"epoch": 6, "step": 80, "loss": 63.940818786621094, "time": 217} -+{"epoch": 7, "step": 85, "loss": 50.857147216796875, "time": 233} -+{"epoch": 7, "step": 90, "loss": 78.37335205078125, "time": 234} -+{"epoch": 7, "step": 95, "loss": 100.13611602783203, "time": 234} -+{"epoch": 8, "step": 100, "loss": 80.35195922851562, "time": 252} -+{"epoch": 8, "step": 105, "loss": 86.00081634521484, "time": 253} -+{"epoch": 9, "step": 110, "loss": 82.35330200195312, "time": 272} -+{"epoch": 9, "step": 115, "loss": 88.81517791748047, "time": 273} -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..245e045 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -47,9 +48,9 @@ parser = argparse.ArgumentParser(description = 'Translation') - # Training hyper-parameters: - parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -+parser.add_argument('--epochs', default=10, type=int, metavar='N', - help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -+parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') - parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -267,7 +269,7 @@ def main_worker(gpu, args): - optimizer.step() - # losses += loss.item() - -- # wandb.log({'iter_loss': loss}) -+# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,19 +327,26 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -+# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) -+ for i in range(len(tgt_out)): -+ tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ print('out', out) -+ print('predicted', tgt_out) -+ - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() -+ - - bleu = bleu_score(predicted, target) - -@@ -375,7 +384,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +392,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+ for i in range(len(tgt_tokens)): -+ tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..267a045 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220416_014133-qw6te5do/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..2534ff1 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220416_014133-qw6te5do/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..659d09a 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220416_014133-qw6te5do -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220416_014133-qw6te5do/files/output.log b/wandb/run-20220416_014133-qw6te5do/files/output.log deleted file mode 100644 index 2515324..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/output.log +++ /dev/null @@ -1,90 +0,0 @@ - -train_translation.py --load=0 -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 5} -/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py:275: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 6} -translation model saved in checkpoint -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10103, 13054, 10108, 37727, 10104, 10372, 11913, 10127, 11053, 119, - 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10125, 10103, 29263, 11280, 21152, 10108, 10103, 16451, 14086, 117, - 11312, 14693, 10173, 54633, 10150, 10110, 29605, 10142, 10104, 10103, - 11134, 13896, 11523, 14650, 10346, 10103, 15152, 10139, 14299, 57616, - 14666, 131, 10103, 20202, 117, 12851, 37727, 10110, 45430, 119, - 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([21113, 10127, 143, 12050, 11913, 10139, 24850, 119, 102], - device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([79481, 11229, 10346, 14356, 20550, 10139, 29785, 14262, 10110, 42136, - 12090, 32837, 10104, 13214, 10982, 16993, 52378, 10320, 85197, 10285, - 71132, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10844, 10104, 10103, 22151, 13170, 117, 79481, 11229, 10346, 14356, - 20550, 10139, 10144, 28194, 23209, 10108, 10103, 22389, 10472, 119, - 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([123, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([124, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([43959, 10139, 13498, 117, 11497, 10110, 13044, 19394, 10107, 14975, - 10551, 40127, 11229, 10346, 19164, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([21113, 112, 161, 12763, 16894, 10438, 31377, 47461, 10563, 10104, - 16769, 10868, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([69066, 10139, 10103, 19569, 10110, 10103, 96237, 14650, 14989, 22107, - 57616, 10104, 10367, 20532, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10103, 35040, 11312, 55257, 10104, 10103, 12878, 10916, 10868, 11229, - 19524, 10487, 11982, 10125, 57616, 10104, 10372, 11913, 10770, 10103, - 29468, 10114, 10695, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10104, 10372, 27195, 117, 10103, 35458, 10108, 12851, 37727, 117, - 45430, 117, 143, 12050, 14149, 19569, 10110, 13293, 11168, 24264, - 11229, 10346, 14758, 17156, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([ 143, 33533, 10108, 10103, 73444, 18116, 118, 22389, 17593, 10104, - 10595, 10151, 124, 43689, 12819, 11229, 10346, 21509, 10104, 10103, - 10403, 11125, 10139, 12851, 118, 10573, 31176, 119, 102], - device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10103, 11594, 112, 161, 22853, 13651, 59343, 10114, 32097, 52958, - 10203, 29263, 68350, 10107, 13208, 13594, 36616, 14094, 19382, 10125, - 10103, 19569, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([15929, 44909, 77949, 10503, 12325, 10103, 12485, 10285, 12238, 14650, - 10346, 31377, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10103, 34763, 10127, 10114, 85270, 65343, 10218, 11497, 10110, 53938, - 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([42416, 72829, 10108, 10246, 18454, 76601, 11229, 10346, 17200, 119, - 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([34321, 11229, 10346, 15227, 10114, 30562, 10103, 18785, 95044, 12705, - 10108, 10103, 11481, 34029, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([ 125, 119, 15636, 10110, 35054, 11229, 14989, 22107, 11232, 119, - 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10197, 11229, 10346, 78832, 10171, 22418, 14856, 10110, 21516, 19771, - 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10668, 10348, 118, 14370, 12325, 11865, 10110, 12077, 10127, 19641, - 43131, 12652, 119, 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([35645, 42888, 10123, 14358, 10104, 14149, 10287, 10110, 27089, 14194, - 12315, 11229, 11923, 10144, 12652, 11892, 10104, 10372, 27195, 119, - 102], device='cuda:0') -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted tensor([10770, 10372, 22151, 13170, 117, 33189, 10125, 143, 23676, 14463, - 10108, 10482, 28781, 10171, 11498, 11229, 10346, 41755, 22117, 119, diff --git a/wandb/run-20220416_014133-qw6te5do/files/requirements.txt b/wandb/run-20220416_014133-qw6te5do/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220416_014133-qw6te5do/files/wandb-metadata.json b/wandb/run-20220416_014133-qw6te5do/files/wandb-metadata.json deleted file mode 100644 index 9966d97..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/wandb-metadata.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T20:11:34.454604", - "startedAt": "2022-04-15T20:11:33.272426", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [ - "--load=0" - ], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json b/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json deleted file mode 100644 index b7216e0..0000000 --- a/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"epoch_loss": 137.94474399089813, "_runtime": 15, "_timestamp": 1650053508, "_step": 0} \ No newline at end of file diff --git a/wandb/run-20220416_014133-qw6te5do/logs/debug-internal.log b/wandb/run-20220416_014133-qw6te5do/logs/debug-internal.log deleted file mode 100644 index a91c8d3..0000000 --- a/wandb/run-20220416_014133-qw6te5do/logs/debug-internal.log +++ /dev/null @@ -1,84 +0,0 @@ -2022-04-16 01:41:33,302 INFO MainThread:6469 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:41:33,304 INFO wandb_internal:6469 [internal.py:wandb_internal():91] W&B internal server running at pid: 6469, started at: 2022-04-16 01:41:33.301961 -2022-04-16 01:41:33,305 INFO MainThread:6469 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:41:33,305 DEBUG MainThread:6469 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-16 01:41:33,307 INFO MainThread:6469 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:41:33,309 INFO MainThread:6469 [wandb_init.py:init():484] communicating current version -2022-04-16 01:41:33,312 INFO WriterThread:6469 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb -2022-04-16 01:41:33,314 DEBUG SenderThread:6469 [sender.py:send():179] send: header -2022-04-16 01:41:33,314 DEBUG HandlerThread:6469 [handler.py:handle_request():124] handle_request: check_version -2022-04-16 01:41:33,315 DEBUG SenderThread:6469 [sender.py:send_request():193] send_request: check_version -2022-04-16 01:41:33,652 INFO MainThread:6469 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:41:33,653 INFO MainThread:6469 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:41:33,656 DEBUG SenderThread:6469 [sender.py:send():179] send: run -2022-04-16 01:41:34,451 INFO MainThread:6469 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:41:34,451 INFO SenderThread:6469 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files -2022-04-16 01:41:34,451 INFO SenderThread:6469 [sender.py:_start_run_threads():707] run started: qw6te5do with start time 1650053493 -2022-04-16 01:41:34,451 DEBUG SenderThread:6469 [sender.py:send():179] send: summary -2022-04-16 01:41:34,452 DEBUG HandlerThread:6469 [handler.py:handle_request():124] handle_request: run_start -2022-04-16 01:41:34,452 INFO SenderThread:6469 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:41:34,454 DEBUG HandlerThread:6469 [meta.py:__init__():39] meta init -2022-04-16 01:41:34,454 DEBUG HandlerThread:6469 [meta.py:__init__():53] meta init done -2022-04-16 01:41:34,454 DEBUG HandlerThread:6469 [meta.py:probe():210] probe -2022-04-16 01:41:34,460 DEBUG HandlerThread:6469 [meta.py:_setup_git():200] setup git -2022-04-16 01:41:34,480 DEBUG HandlerThread:6469 [meta.py:_setup_git():207] setup git done -2022-04-16 01:41:34,481 DEBUG HandlerThread:6469 [meta.py:_save_code():89] save code -2022-04-16 01:41:34,489 DEBUG HandlerThread:6469 [meta.py:_save_code():110] save code done -2022-04-16 01:41:34,489 DEBUG HandlerThread:6469 [meta.py:_save_patches():127] save patches -2022-04-16 01:41:34,554 DEBUG HandlerThread:6469 [meta.py:_save_patches():169] save patches done -2022-04-16 01:41:34,554 DEBUG HandlerThread:6469 [meta.py:_save_pip():57] save pip -2022-04-16 01:41:34,554 DEBUG HandlerThread:6469 [meta.py:_save_pip():71] save pip done -2022-04-16 01:41:34,554 DEBUG HandlerThread:6469 [meta.py:_save_conda():78] save conda -2022-04-16 01:41:35,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json -2022-04-16 01:41:35,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml -2022-04-16 01:41:35,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/code/train_translation.py -2022-04-16 01:41:35,453 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/requirements.txt -2022-04-16 01:41:35,453 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/diff.patch -2022-04-16 01:41:35,453 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/code -2022-04-16 01:41:36,139 DEBUG HandlerThread:6469 [meta.py:_save_conda():86] save conda done -2022-04-16 01:41:36,139 DEBUG HandlerThread:6469 [meta.py:probe():252] probe done -2022-04-16 01:41:36,141 DEBUG SenderThread:6469 [sender.py:send():179] send: files -2022-04-16 01:41:36,141 INFO SenderThread:6469 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:41:36,142 INFO SenderThread:6469 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:41:36,142 INFO SenderThread:6469 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:41:36,149 INFO MainThread:6469 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:41:36,150 DEBUG HandlerThread:6469 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:41:36,150 DEBUG SenderThread:6469 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:41:36,150 INFO MainThread:6469 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:41:36,151 INFO MainThread:6469 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:41:36,152 INFO MainThread:6469 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:41:36,152 INFO MainThread:6469 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:41:36,152 INFO MainThread:6469 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:41:36,451 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml -2022-04-16 01:41:36,451 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/wandb-metadata.json -2022-04-16 01:41:36,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:36,837 DEBUG SenderThread:6469 [sender.py:send():179] send: config -2022-04-16 01:41:37,884 INFO Thread-14 :6469 [upload_job.py:push():133] Uploaded file /tmp/tmp7a2m2v__wandb/2wnhls28-wandb-metadata.json -2022-04-16 01:41:38,099 INFO Thread-15 :6469 [upload_job.py:push():133] Uploaded file /tmp/tmp7a2m2v__wandb/2wpqbnqv-code/train_translation.py -2022-04-16 01:41:38,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:38,856 INFO Thread-22 :6469 [upload_job.py:push():133] Uploaded file /tmp/tmp7a2m2v__wandb/2jnxx1qb-diff.patch -2022-04-16 01:41:39,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/config.yaml -2022-04-16 01:41:40,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:48,279 DEBUG SenderThread:6469 [sender.py:send():179] send: history -2022-04-16 01:41:48,279 DEBUG SenderThread:6469 [sender.py:send():179] send: summary -2022-04-16 01:41:48,279 INFO SenderThread:6469 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:41:48,461 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json -2022-04-16 01:41:48,461 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:50,462 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:51,840 DEBUG HandlerThread:6469 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:41:51,840 DEBUG SenderThread:6469 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:42:02,488 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:03,061 DEBUG SenderThread:6469 [sender.py:send():179] send: stats -2022-04-16 01:42:06,489 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:07,554 DEBUG HandlerThread:6469 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:42:07,554 DEBUG SenderThread:6469 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:42:08,489 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:10,490 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:12,491 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:14,492 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:15,200 INFO MainThread:6469 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-16 01:42:15,200 INFO MainThread:6469 [wandb_run.py:_restore():1480] restore -2022-04-16 01:42:15,200 INFO SenderThread:6469 [sender.py:finish():933] shutting down sender -2022-04-16 01:42:15,200 INFO SenderThread:6469 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-16 01:42:15,200 INFO WriterThread:6469 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb diff --git a/wandb/run-20220416_014133-qw6te5do/logs/debug.log b/wandb/run-20220416_014133-qw6te5do/logs/debug.log deleted file mode 100644 index 76ddcd1..0000000 --- a/wandb/run-20220416_014133-qw6te5do/logs/debug.log +++ /dev/null @@ -1,61 +0,0 @@ -2022-04-16 01:41:33,278 INFO MainThread:6469 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-16 01:41:33,278 INFO MainThread:6469 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-16 01:41:33,279 INFO MainThread:6469 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/logs/debug.log -2022-04-16 01:41:33,279 INFO MainThread:6469 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/logs/debug-internal.log -2022-04-16 01:41:33,279 INFO MainThread:6469 [wandb_init.py:init():369] calling init triggers -2022-04-16 01:41:33,280 INFO MainThread:6469 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:41:33,280 INFO MainThread:6469 [wandb_init.py:init():418] starting backend -2022-04-16 01:41:33,301 INFO MainThread:6469 [backend.py:ensure_launched():132] starting backend process... -2022-04-16 01:41:33,302 INFO MainThread:6469 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:41:33,304 INFO wandb_internal:6469 [internal.py:wandb_internal():91] W&B internal server running at pid: 6469, started at: 2022-04-16 01:41:33.301961 -2022-04-16 01:41:33,305 INFO MainThread:6469 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:41:33,307 INFO MainThread:6469 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:41:33,309 INFO MainThread:6469 [wandb_init.py:init():484] communicating current version -2022-04-16 01:41:33,312 INFO WriterThread:6469 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb -2022-04-16 01:41:33,652 INFO MainThread:6469 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:41:33,653 INFO MainThread:6469 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:41:34,451 INFO MainThread:6469 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:41:34,451 INFO SenderThread:6469 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files -2022-04-16 01:41:34,451 INFO SenderThread:6469 [sender.py:_start_run_threads():707] run started: qw6te5do with start time 1650053493 -2022-04-16 01:41:34,452 INFO SenderThread:6469 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:41:35,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json -2022-04-16 01:41:35,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml -2022-04-16 01:41:35,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/code/train_translation.py -2022-04-16 01:41:35,453 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/requirements.txt -2022-04-16 01:41:35,453 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/diff.patch -2022-04-16 01:41:35,453 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/code -2022-04-16 01:41:36,141 INFO SenderThread:6469 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:41:36,142 INFO SenderThread:6469 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:41:36,142 INFO SenderThread:6469 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:41:36,149 INFO MainThread:6469 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:41:36,150 INFO MainThread:6469 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:41:36,151 INFO MainThread:6469 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:41:36,152 INFO MainThread:6469 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:41:36,152 INFO MainThread:6469 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:41:36,152 INFO MainThread:6469 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:41:36,451 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/conda-environment.yaml -2022-04-16 01:41:36,451 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/wandb-metadata.json -2022-04-16 01:41:36,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:37,884 INFO Thread-14 :6469 [upload_job.py:push():133] Uploaded file /tmp/tmp7a2m2v__wandb/2wnhls28-wandb-metadata.json -2022-04-16 01:41:38,099 INFO Thread-15 :6469 [upload_job.py:push():133] Uploaded file /tmp/tmp7a2m2v__wandb/2wpqbnqv-code/train_translation.py -2022-04-16 01:41:38,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:38,856 INFO Thread-22 :6469 [upload_job.py:push():133] Uploaded file /tmp/tmp7a2m2v__wandb/2jnxx1qb-diff.patch -2022-04-16 01:41:39,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/config.yaml -2022-04-16 01:41:40,452 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:48,279 INFO SenderThread:6469 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:41:48,461 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/wandb-summary.json -2022-04-16 01:41:48,461 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:41:50,462 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:02,488 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:06,489 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:08,489 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:10,490 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:12,491 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:14,492 INFO Thread-11 :6469 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/files/output.log -2022-04-16 01:42:15,200 INFO MainThread:6469 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-16 01:42:15,200 INFO MainThread:6469 [wandb_run.py:_restore():1480] restore -2022-04-16 01:42:15,200 INFO SenderThread:6469 [sender.py:finish():933] shutting down sender -2022-04-16 01:42:15,200 INFO SenderThread:6469 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-16 01:42:15,200 INFO WriterThread:6469 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb diff --git a/wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb b/wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb deleted file mode 100644 index ff87007..0000000 Binary files a/wandb/run-20220416_014133-qw6te5do/run-qw6te5do.wandb and /dev/null differ diff --git a/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py b/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py deleted file mode 100644 index a5d5e46..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py +++ /dev/null @@ -1,405 +0,0 @@ -import numpy as np -from pathlib import Path -import argparse -import json -import math -import os -import random -import signal -import subprocess -import sys -import time - -import torch -from torch import nn, optim -from torch.nn import Transformer -import torchtext -import t_dataset -from t_dataset import Translation_dataset_t -from t_dataset import MyCollate -import translation_utils -from translation_utils import TokenEmbedding, PositionalEncoding -from translation_utils import create_mask -from transformers import BertModel -from transformers import AutoTokenizer -from torch import Tensor -from torchtext.data.metrics import bleu_score -from models import Translator -from models import BarlowTwins - -import wandb - - -#import barlow -os.environ['TRANSFORMERS_OFFLINE'] = 'yes' -os.environ['WANDB_START_METHOD'] = 'thread' -os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - -MANUAL_SEED = 4444 - -random.seed(MANUAL_SEED) -np.random.seed(MANUAL_SEED) -torch.manual_seed(MANUAL_SEED) -torch.backends.cudnn.deterministic = True - - -parser = argparse.ArgumentParser(description = 'Translation') - -# Training hyper-parameters: -parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') -parser.add_argument('--epochs', default=10, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') -parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -parser.add_argument('--dropout', default=0.01, type=float, metavar='d', - help='dropout for training translation transformer') -parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', - help='weight decay') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum for sgd') -parser.add_argument('--clip', default=1, type=float, metavar='GC', - help='Gradient Clipping') -parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', - help='betas for Adam Optimizer') -parser.add_argument('--eps', default=1e-9, type=float, metavar='E', - help='eps for Adam optimizer') -parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', - help='loss function for translation') -parser.add_argument('--optimizer', default='adam', type=str, metavar='OP', - help='selecting optimizer') - -# Transformer parameters: -parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') -parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') -parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') -parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') -parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') - -# Tokenizer: -parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, - metavar='T', help= 'tokenizer') -parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', - help='Dimension of mbert output') -# Paths: -parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, - metavar='DIR', help='path to checkpoint directory') - -# to load or barlow or not: -parser.add_argument('--load', default=0, type=int, - metavar='DIR', help='to load barlow twins encoder or not') - -# calculate bleu: -parser.add_argument('--checkbleu', default=5 , type=int, - metavar='BL', help='check bleu after these number of epochs') -# train or test dataset -parser.add_argument('--train', default=True , type=bool, - metavar='T', help='selecting train set') - -parser.add_argument('--print_freq', default=5 , type=int, - metavar='PF', help='frequency of printing and saving stats') - -parser.add_argument('--test_translation', default=0, type=int, - metavar='TT', help='testing translation_score') -''' NOTE: - Transformer and tokenizer arguments would remain constant in training and context enhancement step. -''' - -args = parser.parse_args() -# print(args.load) -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -def main(): - - # print("entered main") - args.ngpus_per_node = torch.cuda.device_count() - if 'SLURM_JOB_ID' in os.environ: - # single-node and multi-node distributed training on SLURM cluster - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - # find a common host name on all nodes - # assume scontrol returns hosts in the same order on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node - args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node - args.dist_url = f'tcp://{host_name}:58472' - else: - # single-node distributed training - args.rank = 0 - args.dist_url = 'tcp://localhost:58472' - args.world_size = args.ngpus_per_node - torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) - - -def main_worker(gpu, args): - - args.rank += gpu - torch.distributed.init_process_group( - backend='nccl', init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - - if args.rank == 0: - - wandb.init(config=args, project='translation_test')############################################# - wandb.config.update(args) - config = wandb.config - - # exit() - args.checkpoint_dir.mkdir(parents=True, exist_ok=True) - stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) - print(' '.join(sys.argv)) - print(' '.join(sys.argv), file=stats_file) - - torch.cuda.set_device(gpu) - torch.backends.cudnn.benchmark = True - - dataset = Translation_dataset_t(train=args.train) - src_vocab_size = dataset.de_vocab_size - trg_vocab_size = dataset.en_vocab_size - tokenizer = dataset.tokenizer - pad_idx = tokenizer.pad_token_id - sos_idx = tokenizer.cls_token_id - eos_idx = tokenizer.sep_token_id - -# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) - # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) - # print(src_vocab_size, trg_vocab_size) - mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') - transformer = Transformer(d_model=args.dmodel, - nhead=args.nhead, - num_encoder_layers=args.nlayers, - num_decoder_layers = args.nlayers, - dim_feedforward=args.dfeedforward, - dropout=args.dropout) - model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) - # print(model.state_dict) -# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) - - # args.load = False - - if args.load == 1: - # print(args.load) - # print('inside') - print('loading barlow model') - t_enc = model.transformer.encoder - barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) - ### note: lambd is just a placeholder - ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', - map_location='cpu') - barlow.load_state_dict(ckpt['model']) - model.transformer.encoder = barlow.transformer_enc - model.mbert = barlow.mbert - ''' - to_do: - if post_train: - torch.load(model.states_dict) - model.transformer.encoder = model_barlow - - ''' -# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - - param_weights = [] - param_biases = [] - for param in model.parameters(): - if param.ndim == 1: - param_biases.append(param) - else: - param_weights.append(param) - parameters = [{'params': param_weights}, {'params': param_biases}] - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) - -########################################################### - if args.optimizer == 'adam': - optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) - else: - optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) - - if args.loss_fn == 'cross_entropy': - loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) -############################################################## - - start_epoch = 0 - - sampler = torch.utils.data.distributed.DistributedSampler(dataset) - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size - id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - - test_loader = torch.utils.data.DataLoader( - dataset, batch_size=1, num_workers=args.workers, - pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) - ############################# - start_time = time.time() - - - if not args.test_translation: - - for epoch in range(start_epoch, args.epochs): - sampler.set_epoch(epoch) - epoch_loss = 0 - t = 0 - for step, (sent) in enumerate(loader, start=epoch * len(loader)): - src = sent[0].cuda(gpu, non_blocking=True) - tgt_inp = sent[2].cuda(gpu, non_blocking=True) - tgt_out = sent[3].cuda(gpu, non_blocking=True) - - src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) - logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) - - optimizer.zero_grad() - - loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) - loss.backward() - - optimizer.step() - # losses += loss.item() - -# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) - - if step % args.print_freq == 0: - if args.rank == 0: - stats = dict(epoch=epoch, step=step, - loss=loss.item(), - time=int(time.time() - start_time)) - print(json.dumps(stats)) - print(json.dumps(stats), file=stats_file) - if args.rank == 0: - - wandb.log({"epoch_loss":epoch_loss/t}) - # save checkpoint - state = dict(epoch=epoch + 1, model=model.module.state_dict(), - optimizer=optimizer.state_dict()) - # print(model.state_dict) - torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') - print('translation model saved in', args.checkpoint_dir) - - ############################################################## - if args.rank == 0: - if epoch%args.checkbleu ==0 : - - bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## - # if epoch%1 ==0 : - # torch.save(model.module.state_dict(), - # 'path.pth') - # print("Model is saved") - # if args.rank == 0: - # # save checkpoint - # state = dict(epoch=epoch + 1, model=model.state_dict(), - # optimizer=optimizer.state_dict()) - # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') - # print('saved translation model in', args.checkpoint_dir) - wandb.finish() - - else: - - bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - -def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] - target=[] - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) - out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) - for i in range(len(tgt_out)): - tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) - print('out', out) - print('predicted', tokenizer.convert_ids_to_tokens(tgt_out)) - - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() - - - bleu = bleu_score(predicted, target) - - return bleu - -''' -todo: - BLEU score -''' - -# function to generate output sequence using greedy algorithm -def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - src = src - src_mask = src_mask - - memory = model.module.encode(src, src_mask) - ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) - for i in range(max_len-1): - memory = memory - tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) - .type(torch.bool)).cuda(gpu, non_blocking=True) - out = model.module.decode(ys, memory, tgt_mask) - out = out.transpose(0, 1) - prob = model.module.generator(out[:, -1]) - _, next_word = torch.max(prob, dim=1) - next_word = next_word.item() - - ys = torch.cat([ys, - torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) - if next_word == eos_idx: - break - return ys - - -# actual function to translate input sentence into target language -def translate(model: torch.nn.Module, - src: torch.tensor, - tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] - - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() - - for i in range(len(tgt_tokens)): - tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -# print(tgt_tokens) - - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -if __name__ == '__main__': - main() - wandb.finish() diff --git a/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml b/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml deleted file mode 100644 index fd74d2b..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml +++ /dev/null @@ -1,158 +0,0 @@ -name: ectc -channels: - - pytorch - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - blas=1.0=mkl - - brotlipy=0.7.0=py37h27cfd23_1003 - - bzip2=1.0.8=h7b6447c_0 - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py37h06a4308_2 - - cffi=1.15.0=py37hd667e15_1 - - cryptography=36.0.0=py37h9ce1e76_0 - - cudatoolkit=11.3.1=h2bc3f7f_2 - - ffmpeg=4.3=hf484d3e_0 - - freetype=2.11.0=h70c0345_0 - - giflib=5.2.1=h7b6447c_0 - - gmp=6.2.1=h2531618_2 - - gnutls=3.6.15=he1e5248_0 - - idna=3.3=pyhd3eb1b0_0 - - intel-openmp=2021.4.0=h06a4308_3561 - - jpeg=9d=h7f8727e_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libiconv=1.15=h63c8f33_5 - - libidn2=2.3.2=h7f8727e_0 - - libpng=1.6.37=hbc83047_0 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - libtasn1=4.16.0=h27cfd23_0 - - libtiff=4.2.0=h85742a9_0 - - libunistring=0.9.10=h27cfd23_0 - - libuv=1.40.0=h7b6447c_0 - - libwebp=1.2.2=h55f646e_0 - - libwebp-base=1.2.2=h7f8727e_0 - - lz4-c=1.9.3=h295c915_1 - - mkl=2021.4.0=h06a4308_640 - - mkl-service=2.4.0=py37h7f8727e_0 - - mkl_fft=1.3.1=py37hd3c417c_0 - - mkl_random=1.2.2=py37h51133e4_0 - - ncurses=6.3=h7f8727e_2 - - nettle=3.7.3=hbbd107a_1 - - numpy-base=1.21.2=py37h79a1101_0 - - openh264=2.1.1=h4ff587b_0 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.2=py37h06a4308_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyopenssl=22.0.0=pyhd3eb1b0_0 - - pysocks=1.7.1=py37_1 - - python=3.7.11=h12debd9_0 - - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 - - pytorch-mutex=1.0=cuda - - readline=8.1.2=h7f8727e_1 - - requests=2.27.1=pyhd3eb1b0_0 - - setuptools=58.0.4=py37h06a4308_0 - - six=1.16.0=pyhd3eb1b0_1 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - torchaudio=0.11.0=py37_cu113 - - typing_extensions=4.1.1=pyh06a4308_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 - - zstd=1.4.9=haebb681_0 - - pip: - - aiohttp==3.8.1 - - aiosignal==1.2.0 - - antlr4-python3-runtime==4.8 - - async-timeout==4.0.2 - - asynctest==0.13.0 - - attrs==21.4.0 - - backcall==0.2.0 - - bitarray==2.4.1 - - blessings==1.7 - - charset-normalizer==2.0.12 - - click==8.0.4 - - colorama==0.4.4 - - configparser==5.2.0 - - cython==0.29.28 - - datasets==1.16.1 - - debugpy==1.6.0 - - decorator==5.1.1 - - dill==0.3.4 - - docker-pycreds==0.4.0 - - entrypoints==0.4 - - fastbpe==0.1.0 - - filelock==3.6.0 - - frozenlist==1.3.0 - - fsspec==2022.2.0 - - gitdb==4.0.9 - - gitpython==3.1.27 - - gpustat==0.6.0 - - huggingface-hub==0.4.0 - - hydra-core==1.0.7 - - importlib-metadata==4.11.3 - - importlib-resources==5.6.0 - - ipykernel==6.12.1 - - ipython==7.32.0 - - jedi==0.18.1 - - joblib==1.1.0 - - jupyter-client==7.2.2 - - jupyter-core==4.9.2 - - matplotlib-inline==0.1.3 - - mock==4.0.3 - - multidict==6.0.2 - - multiprocess==0.70.12.2 - - nest-asyncio==1.5.5 - - numpy==1.21.5 - - nvidia-ml-py3==7.352.0 - - omegaconf==2.0.6 - - packaging==21.3 - - pandas==1.3.5 - - parso==0.8.3 - - pathtools==0.1.2 - - pexpect==4.8.0 - - pickleshare==0.7.5 - - pillow==9.0.1 - - portalocker==2.4.0 - - promise==2.3 - - prompt-toolkit==3.0.29 - - protobuf==3.19.4 - - psutil==5.9.0 - - ptyprocess==0.7.0 - - pyarrow==7.0.0 - - pygments==2.11.2 - - pyparsing==3.0.7 - - python-dateutil==2.8.2 - - pytz==2022.1 - - pyyaml==6.0 - - pyzmq==22.3.0 - - regex==2022.3.15 - - sacrebleu==2.0.0 - - sacremoses==0.0.49 - - sentry-sdk==1.5.8 - - shortuuid==1.0.8 - - smmap==5.0.0 - - subprocess32==3.5.4 - - subword-nmt==0.3.8 - - tabulate==0.8.9 - - tokenizers==0.10.3 - - torch==1.11.0 - - torchtext==0.12.0 - - torchvision==0.9.1 - - tornado==6.1 - - tqdm==4.63.1 - - traitlets==5.1.1 - - transformers==4.14.1 - - urllib3==1.26.9 - - wandb==0.10.31 - - wcwidth==0.2.5 - - xxhash==3.0.0 - - yarl==1.7.2 - - zipp==3.7.0 -prefix: /home/ivlabs/miniconda3/envs/ectc diff --git a/wandb/run-20220416_014323-1a0lobwa/files/config.yaml b/wandb/run-20220416_014323-1a0lobwa/files/config.yaml deleted file mode 100644 index 52b4100..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/config.yaml +++ /dev/null @@ -1,110 +0,0 @@ -wandb_version: 1 - -_wandb: - desc: null - value: - cli_version: 0.10.31 - code_path: code/train_translation.py - framework: huggingface - huggingface_version: 4.14.1 - is_jupyter_run: false - is_kaggle_kernel: false - python_version: 3.7.11 - t: - 1: - - 1 - - 11 - 4: 3.7.11 - 5: 0.10.31 - 6: 4.14.1 - 8: - - 8 -batch_size: - desc: null - value: 16 -betas: - desc: null - value: - - 0.9 - - 0.98 -checkbleu: - desc: null - value: 5 -checkpoint_dir: - desc: null - value: checkpoint -clip: - desc: null - value: 1 -dfeedforward: - desc: null - value: 200 -dist_url: - desc: null - value: tcp://localhost:58472 -dmodel: - desc: null - value: 768 -dropout: - desc: null - value: 0.01 -epochs: - desc: null - value: 10 -eps: - desc: null - value: 1.0e-09 -learning_rate: - desc: null - value: 0.2 -load: - desc: null - value: 0 -loss_fn: - desc: null - value: cross_entropy -mbert_out_size: - desc: null - value: 768 -momentum: - desc: null - value: 0.9 -ngpus_per_node: - desc: null - value: 2 -nhead: - desc: null - value: 4 -nlayers: - desc: null - value: 3 -optimizer: - desc: null - value: adam -print_freq: - desc: null - value: 5 -projector: - desc: null - value: 768-256 -rank: - desc: null - value: 0 -test_translation: - desc: null - value: 0 -tokenizer: - desc: null - value: bert-base-multilingual-uncased -train: - desc: null - value: true -weight_decay: - desc: null - value: 1.0e-06 -workers: - desc: null - value: 4 -world_size: - desc: null - value: 2 diff --git a/wandb/run-20220416_014323-1a0lobwa/files/diff.patch b/wandb/run-20220416_014323-1a0lobwa/files/diff.patch deleted file mode 100644 index 5f2c089..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/diff.patch +++ /dev/null @@ -1,30817 +0,0 @@ -diff --git a/__pycache__/barlow_utils.cpython-37.pyc b/__pycache__/barlow_utils.cpython-37.pyc -index 3c0d4fe..b13b62f 100644 -Binary files a/__pycache__/barlow_utils.cpython-37.pyc and b/__pycache__/barlow_utils.cpython-37.pyc differ -diff --git a/__pycache__/models.cpython-37.pyc b/__pycache__/models.cpython-37.pyc -index 3bbb9de..acc1737 100644 -Binary files a/__pycache__/models.cpython-37.pyc and b/__pycache__/models.cpython-37.pyc differ -diff --git a/__pycache__/t_dataset.cpython-37.pyc b/__pycache__/t_dataset.cpython-37.pyc -index 2650733..c4b566b 100644 -Binary files a/__pycache__/t_dataset.cpython-37.pyc and b/__pycache__/t_dataset.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-37.pyc b/__pycache__/translation_utils.cpython-37.pyc -index 60c9eda..12c22a5 100644 -Binary files a/__pycache__/translation_utils.cpython-37.pyc and b/__pycache__/translation_utils.cpython-37.pyc differ -diff --git a/__pycache__/translation_utils.cpython-38.pyc b/__pycache__/translation_utils.cpython-38.pyc -index 061d0e7..a1e7877 100644 -Binary files a/__pycache__/translation_utils.cpython-38.pyc and b/__pycache__/translation_utils.cpython-38.pyc differ -diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt -index 884dd9c..83f30a6 100644 ---- a/checkpoint/stats.txt -+++ b/checkpoint/stats.txt -@@ -833,3 +833,202 @@ train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 - - {"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} - {"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} - {"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 4} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 5} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 5} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 6} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 7} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 7} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 8} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 8} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 9} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 8} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 15} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 183} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 239} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 295} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 351} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 407} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 463} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 19} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 104} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 355} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 606} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.121065616607666, "time": 9} -+{"epoch": 0, "step": 5, "loss": 97.44178771972656, "time": 10} -+{"epoch": 0, "step": 10, "loss": 168.33328247070312, "time": 12} -+{"epoch": 0, "step": 15, "loss": 133.17933654785156, "time": 12} -+{"epoch": 0, "step": 20, "loss": 112.3768539428711, "time": 13} -+{"epoch": 0, "step": 25, "loss": 120.29653930664062, "time": 14} -+{"epoch": 0, "step": 30, "loss": 119.97941589355469, "time": 15} -+{"epoch": 0, "step": 35, "loss": 86.40515899658203, "time": 16} -+{"epoch": 0, "step": 40, "loss": 70.5906982421875, "time": 17} -+train_translation.py -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 28} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 155} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 281} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 405} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 530} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 657} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 783} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 908} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 1033} -+train_translation.py -+train_translation.py -+train_translation.py -+train_translation.py --load=1 -+train_translation.py --load=1 -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 65} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 178} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 9} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 37} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 66} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 94} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 122} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 150} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 179} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 207} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 235} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 16} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 72} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 128} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 184} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 240} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 296} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 352} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 408} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 464} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 692} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 20} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 188} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 272} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 356} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 439} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 523} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 607} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 690} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 105} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 440} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.079373359680176, "time": 21} -+{"epoch": 0, "step": 5, "loss": 100.0801773071289, "time": 106} -+{"epoch": 0, "step": 10, "loss": 157.312744140625, "time": 189} -+{"epoch": 0, "step": 15, "loss": 78.03355407714844, "time": 273} -+{"epoch": 0, "step": 20, "loss": 85.30223083496094, "time": 357} -+{"epoch": 0, "step": 25, "loss": 79.75176239013672, "time": 441} -+{"epoch": 0, "step": 30, "loss": 123.69627380371094, "time": 524} -+{"epoch": 0, "step": 35, "loss": 70.34227752685547, "time": 608} -+{"epoch": 0, "step": 40, "loss": 108.36054229736328, "time": 691} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 19} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 104} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 188} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -+{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 70} -+{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 70} -+{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 116} -+{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 116} -+{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 117} -+{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 164} -+{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 165} -+{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 182} -+{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 183} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 6} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 7} -+{"epoch": 1, "step": 15, "loss": 138.67442321777344, "time": 73} -+{"epoch": 1, "step": 20, "loss": 75.6456298828125, "time": 74} -+{"epoch": 2, "step": 25, "loss": 64.19247436523438, "time": 92} -+{"epoch": 2, "step": 30, "loss": 65.62056732177734, "time": 93} -+{"epoch": 2, "step": 35, "loss": 66.36638641357422, "time": 93} -+{"epoch": 3, "step": 40, "loss": 77.29269409179688, "time": 110} -+{"epoch": 3, "step": 45, "loss": 68.74011993408203, "time": 111} -+{"epoch": 4, "step": 50, "loss": 74.82659912109375, "time": 131} -+{"epoch": 4, "step": 55, "loss": 77.39452362060547, "time": 132} -+{"epoch": 5, "step": 60, "loss": 62.27414321899414, "time": 149} -+{"epoch": 5, "step": 65, "loss": 90.9207992553711, "time": 150} -+{"epoch": 5, "step": 70, "loss": 66.96754455566406, "time": 150} -+{"epoch": 6, "step": 75, "loss": 71.40245819091797, "time": 216} -+{"epoch": 6, "step": 80, "loss": 63.940818786621094, "time": 217} -+{"epoch": 7, "step": 85, "loss": 50.857147216796875, "time": 233} -+{"epoch": 7, "step": 90, "loss": 78.37335205078125, "time": 234} -+{"epoch": 7, "step": 95, "loss": 100.13611602783203, "time": 234} -+{"epoch": 8, "step": 100, "loss": 80.35195922851562, "time": 252} -+{"epoch": 8, "step": 105, "loss": 86.00081634521484, "time": 253} -+{"epoch": 9, "step": 110, "loss": 82.35330200195312, "time": 272} -+{"epoch": 9, "step": 115, "loss": 88.81517791748047, "time": 273} -+train_translation.py --load=0 -+{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 5} -+{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 5} -+{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 6} -diff --git a/t_dataset.py b/t_dataset.py -index c7ab181..53d5caa 100644 ---- a/t_dataset.py -+++ b/t_dataset.py -@@ -20,19 +20,19 @@ class Translation_dataset_t(Dataset): - split = "train" - else: - split = "test" -- self.dataset = load_dataset('wmt14', "de-en", split=split) -+ self.dataset = load_dataset('opus_rf', "de-en", split=split) - self.de_list = [] - self.en_list = [] - # self.tokenizer = tokenizer - self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased') -- dataset = load_dataset('opus_rf', 'de-en', split='train') - en_list_2 = [] -- for n, i in enumerate(dataset): -+ for n, i in enumerate(self.dataset): - en_list_2.append(i['translation']['en'].lower()) - - a1 = list(self.tokenizer(en_list_2, padding=True, return_tensors='pt')['input_ids']) - self.en_vocab, self.en_vocab_size = vocab(a1) - self.bert2id_dict = translation_utils.bert2id(self.en_vocab) -+ self.id2bert_dict = translation_utils.id2bert(self.en_vocab) - - for i in self.dataset: - self.de_list.append(self.tokenizer(i['translation']['de'].lower(), -diff --git a/train_translation.py b/train_translation.py -index eea074a..a5d5e46 100644 ---- a/train_translation.py -+++ b/train_translation.py -@@ -33,6 +33,7 @@ import wandb - #import barlow - os.environ['TRANSFORMERS_OFFLINE'] = 'yes' - os.environ['WANDB_START_METHOD'] = 'thread' -+os.environ['CUDA_LAUNCH_BLOCKING'] = '1' - - MANUAL_SEED = 4444 - -@@ -47,9 +48,9 @@ parser = argparse.ArgumentParser(description = 'Translation') - # Training hyper-parameters: - parser.add_argument('--workers', default=4, type=int, metavar='N', - help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -+parser.add_argument('--epochs', default=10, type=int, metavar='N', - help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -+parser.add_argument('--batch_size', default=16, type=int, metavar='n', - help='mini-batch size') - parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', - help='base learning rate') -@@ -75,9 +76,9 @@ parser.add_argument('--dmodel', default=768, type=int, metavar='T', - help='dimension of transformer encoder') - parser.add_argument('--nhead', default=4, type= int, metavar='N', - help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=500, type=int, metavar='F', -+parser.add_argument('--dfeedforward', default=200, type=int, metavar='F', - help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=8, type=int, metavar= 'N', -+parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', - help='number of layers of transformer encoder') - parser.add_argument('--projector', default='768-256', type=str, - metavar='MLP', help='projector MLP') -@@ -233,6 +234,7 @@ def main_worker(gpu, args): - - assert args.batch_size % args.world_size == 0 - per_device_batch_size = args.batch_size // args.world_size -+ id2bert_dict = dataset.id2bert_dict - ############################### - loader = torch.utils.data.DataLoader( - dataset, batch_size=per_device_batch_size, num_workers=args.workers, -@@ -267,7 +269,7 @@ def main_worker(gpu, args): - optimizer.step() - # losses += loss.item() - -- # wandb.log({'iter_loss': loss}) -+# wandb.log({'iter_loss': loss}) - epoch_loss += loss.item() - t += 1 - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -@@ -293,7 +295,7 @@ def main_worker(gpu, args): - if args.rank == 0: - if epoch%args.checkbleu ==0 : - -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -+ bleu_score = checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu) - wandb.log({'bleu_score': bleu_score}) - # print(bleu_score(predicted, target)) - ############################################################## -@@ -311,13 +313,13 @@ def main_worker(gpu, args): - - else: - -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -+ bleu_score = checkbleu(model,tokenizer, test_loader, id2bert_dict, gpu ) - print('test_bleu_score', bleu_score) - if args.rank == 0: - wandb.log({'bleu_score': bleu_score}) - - --def checkbleu(model, tokenizer, test_loader, gpu): -+def checkbleu(model, tokenizer, test_loader, id2bert_dict, gpu): - - model.eval() - predicted=[] -@@ -325,19 +327,26 @@ def checkbleu(model, tokenizer, test_loader, gpu): - - for i in test_loader: - src = i[0].cuda(gpu, non_blocking=True) -+# tgt_out = i[1][1:, : ].cuda(gpu, non_blocking=True) - tgt_out = i[3].cuda(gpu, non_blocking=True) - num_tokens = src.shape[0] - - src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -+ out = translate(model, src, tokenizer, src_mask, id2bert_dict, gpu) - predicted.append(out) -+ for i in range(len(tgt_out)): -+ tgt_out[i] = id2bert_dict[tgt_out[i].item()] - target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -+ print('out', out) -+ print('predicted', tokenizer.convert_ids_to_tokens(tgt_out)) -+ - - try: - bleu_score(predicted, target) - except: - predicted.pop() - target.pop() -+ - - bleu = bleu_score(predicted, target) - -@@ -375,7 +384,7 @@ def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): - # actual function to translate input sentence into target language - def translate(model: torch.nn.Module, - src: torch.tensor, -- tokenizer,src_mask, gpu): -+ tokenizer,src_mask, id2bert_dict, gpu): - model.eval() - - num_tokens = src.shape[0] -@@ -383,6 +392,11 @@ def translate(model: torch.nn.Module, - - tgt_tokens = greedy_decode( - model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -+ -+ for i in range(len(tgt_tokens)): -+ tgt_tokens[i] = id2bert_dict[tgt_tokens[i].item()] -+# print(tgt_tokens) -+ - return tokenizer.convert_ids_to_tokens(tgt_tokens) - - -diff --git a/translation_dataset.py b/translation_dataset.py -index 274c2f3..82270c6 100644 ---- a/translation_dataset.py -+++ b/translation_dataset.py -@@ -11,7 +11,7 @@ class Translation_dataset(Dataset): - - def __init__(self): - -- self.dataset = load_dataset('wmt14', "de-en", split="train") -+ self.dataset = load_dataset('opus_rf', "de-en", split="train") - self.de_list = [] - self.en_list = [] - -diff --git a/translation_utils.py b/translation_utils.py -index 6c66f53..4b3b830 100644 ---- a/translation_utils.py -+++ b/translation_utils.py -@@ -31,6 +31,13 @@ def bert2id(de_list: set): - - return label_dict - -+def id2bert(de_list: set): -+ label_dict = {} -+ for n, i in enumerate(de_list): -+ label_dict[n] = i -+ -+ return label_dict -+ - def generate_square_subsequent_mask(sz): - mask = (torch.triu(torch.ones((sz, sz))) == 1).transpose(0, 1) - mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) -@@ -81,10 +88,10 @@ class TokenEmbedding(nn.Module): - super(TokenEmbedding, self).__init__() - # self.embedding = nn.Embedding(vocab_size, emb_size) - self.embedding = mbert --# for param in self.embedding.parameters(): --# param.requires_grad = False --# for param in self.embedding.pooler.parameters(): --# param.requires_grad = True -+ for param in self.embedding.parameters(): -+ param.requires_grad = False -+ for param in self.embedding.pooler.parameters(): -+ param.requires_grad = True - self.emb_size = emb_size - - def forward(self, tokens: torch.tensor): -diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log -index 6163657..5c95722 120000 ---- a/wandb/debug-internal.log -+++ b/wandb/debug-internal.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug-internal.log -\ No newline at end of file -+run-20220416_014323-1a0lobwa/logs/debug-internal.log -\ No newline at end of file -diff --git a/wandb/debug.log b/wandb/debug.log -index 7d0f5dd..c54d1ec 120000 ---- a/wandb/debug.log -+++ b/wandb/debug.log -@@ -1 +1 @@ --run-20220409_182749-paufev36/logs/debug.log -\ No newline at end of file -+run-20220416_014323-1a0lobwa/logs/debug.log -\ No newline at end of file -diff --git a/wandb/latest-run b/wandb/latest-run -index f11d588..34b339f 120000 ---- a/wandb/latest-run -+++ b/wandb/latest-run -@@ -1 +1 @@ --run-20220409_182749-paufev36 -\ No newline at end of file -+run-20220416_014323-1a0lobwa -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py b/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -deleted file mode 100644 -index 9236ace..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py -+++ /dev/null -@@ -1,350 +0,0 @@ --# Copyright (c) Facebook, Inc. and its affiliates. --# All rights reserved. --# --# This source code is licensed under the license found in the --# LICENSE file in the root directory of this source tree. -- --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time --from translation_dataset import Translation_dataset --from translation_dataset import MyCollate --from transformers import BertModel --from transformers import AutoTokenizer --from torch import nn, optim --import torch --from t_dataset import Translation_dataset_t --from torch.nn import Transformer --from models import BarlowTwins --from models import Translator --from barlow_utils import off_diagonal --import wandb --#from _config import Config --#config = Config.config -- --os.environ['WANDB_START_METHOD'] = 'thread' -- --#setting random seeds --SEED = 4444 -- --random.seed(SEED) --np.random.seed(SEED) --torch.manual_seed(SEED) --torch.cuda.manual_seed(SEED) --torch.backends.cudnn.deterministic = True -- -- -- -- --parser = argparse.ArgumentParser(description='Barlow Twins Training') --# parser.add_batch_sizeargument('data', type=Path, metavar='DIR', --# help='path to dataset') -- -- -- --# Training parameters: --parser.add_argument('--workers', default=20, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=2, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=64, type=int, metavar='N', -- help='mini-batch size') --parser.add_argument('--learning-rate-weights', default=0.2, type=float, metavar='LR', -- help='base learning rate for weights') --parser.add_argument('--learning-rate-biases', default=0.0048, type=float, metavar='LR', -- help='base learning rate for biases and batch norm parameters') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--lambd', default=0.0051, type=float, metavar='L', -- help='weight on off-diagonal terms') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') -- --# Model parameters: --parser.add_argument('--projector', default='768-768', type=str, -- metavar='MLP', help='projector MLP') --parser.add_argument('--print-freq', default=100, type=int, metavar='N', -- help='print frequency') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=3, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--dropout', default=0.0051, type=float, metavar= 'D', -- help='dropout in transformer') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-cased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint-dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') --parser.add_argument('--load', default=1, type=int, -- metavar='LO', help='load weights from translation model') -- --args = parser.parse_args() -- --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- wandb.init(config=args)############################################# -- # wandb.config.update(args) -- config = wandb.config -- # print(args.lambd, config.lambd) -- # wandb.finish() -- # exibatch_sizet() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=False) -- t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- mbert = BertModel.from_pretrained(args.tokenizer) -- model = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=args.lambd).cuda(gpu) -- model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- optimizer = LARS(parameters, lr=0, weight_decay=args.weight_decay, -- weight_decay_filter=True, -- lars_adaptation_filter=True) -- # optimizer = torch.optim.Adam(model.parameters(),lr=0.001) -- -- # automatically resume from checkpoint if it exists -- # if (args.checkpoint_dir / 'checkpoint.pth').is_file(): -- # ckpt = torch.load(args.checkpoint_dir / 'checkpoint.pth', -- # map_location='cpu') -- # start_epoch = ckpt['epoch'] -- # # print("model=",model) -- # # print("ckpt=",ckpt['model']) -- # model.load_state_dict(ckpt['model']) -- # optimizer.load_state_dict(ckpt['optimizer']) -- # else: -- -- trans_dataset = Translation_dataset_t(train=True) -- src_vocab_size = trans_dataset.de_vocab_size -- tgt_vocab_size = trans_dataset.en_vocab_size -- tokenizer = trans_dataset.tokenizer -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers=args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- print(args.batch_size) -- translation_model = Translator(mbert, -- transformer, -- tgt_vocab_size=tgt_vocab_size, -- emb_size=args.mbert_out_size) -- -- if args.load == 1 : -- print('loading translation model') -- ckpt = torch.load(args.checkpoint_dir / 'translation_checkpoint.pth') #,map_location='cpu') -- translation_model.load_state_dict(ckpt['model']) -- model.transformer_enc = translation_model.transformer.encoder -- model.mbert = translation_model.tok_emb.embedding -- -- start_epoch = 0 -- -- -- ################################ -- # dataset = torchvision.datasets.ImageFolder(args.data / 'train', Transform()) -- # sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- ############################### -- -- dataset = Translation_dataset() -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate()) -- ############################# -- start_time = time.time() -- scaler = torch.cuda.amp.GradScaler() -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- adjust_learning_rate(args, optimizer, loader, step) -- optimizer.zero_grad() -- with torch.cuda.amp.autocast(): -- _, loss = model.forward(y1, y2) -- wandb.log({'iter_loss':loss}) --# print(loss.item()) -- epoch_loss += loss.item() -- scaler.scale(loss).backward() -- torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) -- scaler.step(optimizer) -- scaler.update() -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- lr_weights=optimizer.param_groups[0]['lr'], -- lr_biases=optimizer.param_groups[1]['lr'], -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.state_dict(), -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) -- for sent in test_loader: -- y1 = sent[0].cuda(gpu, non_blocking=True) -- y2 = sent[1].cuda(gpu, non_blocking=True) -- model.eval() -- c, _ = model(y1, y2) -- xlabels = tokenizer.convert_ids_to_tokens(y2) -- ylabels = tokenizer.convert_ids_to_tokens(y1) -- wandb.finish() --# if args.rank == 0: --# save final model --# torch.save(model.module.state_dict(), --# args.checkpoint_dir / 'translation.pth') -- -- --def adjust_learning_rate(args, optimizer, loader, step): -- max_steps = args.epochs * len(loader) -- warmup_steps = 10 * len(loader) -- base_lr = args.batch_size / 256 -- if step < warmup_steps: -- lr = base_lr * step / warmup_steps -- else: -- step -= warmup_steps -- max_steps -= warmup_steps -- q = 0.5 * (1 + math.cos(math.pi * step / max_steps)) -- end_lr = base_lr * 0.001 -- lr = base_lr * q + end_lr * (1 - q) -- optimizer.param_groups[0]['lr'] = lr * args.learning_rate_weights -- optimizer.param_groups[1]['lr'] = lr * args.learning_rate_biases -- -- --def handle_sigusr1(signum, frame): -- os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') -- exit() -- -- --def handle_sigterm(signum, frame): -- pass -- -- --class LARS(optim.Optimizer): -- def __init__(self, params, lr, weight_decay=0, momentum=0.9, eta=0.001, -- weight_decay_filter=False, lars_adaptation_filter=False): -- defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum, -- eta=eta, weight_decay_filter=weight_decay_filter, -- lars_adaptation_filter=lars_adaptation_filter) -- super().__init__(params, defaults) -- -- -- def exclude_bias_and_norm(self, p): -- return p.ndim == 1 -- -- @torch.no_grad() -- def step(self): -- for g in self.param_groups: -- for p in g['params']: -- dp = p.grad -- -- if dp is None: -- continue -- -- if not g['weight_decay_filter'] or not self.exclude_bias_and_norm(p): -- dp = dp.add(p, alpha=g['weight_decay']) -- -- if not g['lars_adaptation_filter'] or not self.exclude_bias_and_norm(p): -- param_norm = torch.norm(p) -- update_norm = torch.norm(dp) -- one = torch.ones_like(param_norm) -- q = torch.where(param_norm > 0., -- torch.where(update_norm > 0, -- (g['eta'] * param_norm / update_norm), one), one) -- dp = dp.mul(q) -- -- param_state = self.state[p] -- if 'mu' not in param_state: -- param_state['mu'] = torch.zeros_like(p) -- mu = param_state['mu'] -- mu.mul_(g['momentum']).add_(dp) -- -- p.add_(mu, alpha=-g['lr']) -- -- --if __name__ == '__main__': -- try: -- main() -- except KeyboardInterrupt: -- print('Interrupted') -- wandb.finish() -- try: -- sys.exit(0) -- except SystemExit: -- os._exit(0) -diff --git a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml b/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220406_171518-s7zesus8/files/config.yaml b/wandb/run-20220406_171518-s7zesus8/files/config.yaml -deleted file mode 100644 -index 147470d..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/config.yaml -+++ /dev/null -@@ -1,90 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/barlow.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.0051 --epochs: -- desc: null -- value: 2 --lambd: -- desc: null -- value: 0.0051 --learning_rate_biases: -- desc: null -- value: 0.0048 --learning_rate_weights: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 3 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 100 --projector: -- desc: null -- value: 768-768 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-cased --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 20 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220406_171518-s7zesus8/files/output.log b/wandb/run-20220406_171518-s7zesus8/files/output.log -deleted file mode 100644 -index 847ffbb..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/output.log -+++ /dev/null -@@ -1,74 +0,0 @@ -- --barlow.py --load 0 --Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Error in sys.excepthook: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 47, in getlines -- return updatecache(filename, module_globals) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/linecache.py", line 136, in updatecache -- with tokenize.open(fullname) as fp: -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/tokenize.py", line 447, in open -- buffer = _builtin_open(filename, 'rb') --KeyboardInterrupt --Original exception was: --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt b/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -deleted file mode 100644 -index 5f93d29..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json -+++ /dev/null -@@ -1,21 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-06T11:45:20.215162", -- "startedAt": "2022-04-06T11:45:18.613420", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_enhancement/barlow.py", -- "codePath": "barlow.py", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json b/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log b/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -deleted file mode 100644 -index 0630656..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log -+++ /dev/null -@@ -1,91 +0,0 @@ --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,622 DEBUG MainThread:16786 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: check_version --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send():179] send: header --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:18,626 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: check_version --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:19,155 DEBUG SenderThread:16786 [sender.py:send():179] send: run --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:19,158 DEBUG SenderThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 DEBUG SenderThread:16786 [sender.py:send():179] send: summary --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:20,211 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: run_start --2022-04-06 17:15:20,214 DEBUG HandlerThread:16786 [meta.py:__init__():39] meta init --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:__init__():53] meta init done --2022-04-06 17:15:20,215 DEBUG HandlerThread:16786 [meta.py:probe():210] probe --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [git.py:repo():33] git repository is invalid --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():89] save code --2022-04-06 17:15:20,220 DEBUG HandlerThread:16786 [meta.py:_save_code():110] save code done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():57] save pip --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_pip():71] save pip done --2022-04-06 17:15:20,221 DEBUG HandlerThread:16786 [meta.py:_save_conda():78] save conda --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,240 DEBUG HandlerThread:16786 [meta.py:_save_conda():86] save conda done --2022-04-06 17:15:22,241 DEBUG HandlerThread:16786 [meta.py:probe():252] probe done --2022-04-06 17:15:22,255 DEBUG SenderThread:16786 [sender.py:send():179] send: files --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 DEBUG HandlerThread:16786 [handler.py:handle_request():124] handle_request: stop_status --2022-04-06 17:15:22,262 DEBUG SenderThread:16786 [sender.py:send_request():193] send_request: stop_status --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/logs/debug.log b/wandb/run-20220406_171518-s7zesus8/logs/debug.log -deleted file mode 100644 -index 9769176..0000000 ---- a/wandb/run-20220406_171518-s7zesus8/logs/debug.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/logs/debug-internal.log --2022-04-06 17:15:18,614 INFO MainThread:16786 [wandb_init.py:init():369] calling init triggers --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 20, 'epochs': 2, 'batch_size': 64, 'learning_rate_weights': 0.2, 'learning_rate_biases': 0.0048, 'weight_decay': 1e-06, 'lambd': 0.0051, 'clip': 1, 'projector': '768-768', 'print_freq': 100, 'dmodel': 768, 'nhead': 3, 'dfeedforward': 256, 'nlayers': 3, 'dropout': 0.0051, 'tokenizer': 'bert-base-multilingual-cased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-06 17:15:18,615 INFO MainThread:16786 [wandb_init.py:init():418] starting backend --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():132] starting backend process... --2022-04-06 17:15:18,619 INFO MainThread:16786 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-06 17:15:18,620 INFO wandb_internal:16786 [internal.py:wandb_internal():91] W&B internal server running at pid: 16786, started at: 2022-04-06 17:15:18.619828 --2022-04-06 17:15:18,620 INFO MainThread:16786 [wandb_init.py:init():423] backend started and connected --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():465] updated telemetry --2022-04-06 17:15:18,625 INFO MainThread:16786 [wandb_init.py:init():484] communicating current version --2022-04-06 17:15:18,626 INFO WriterThread:16786 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.12 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-06 17:15:19,154 INFO MainThread:16786 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-06 17:15:20,208 INFO SenderThread:16786 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:20,208 INFO SenderThread:16786 [sender.py:_start_run_threads():707] run started: s7zesus8 with start time 1649245518 --2022-04-06 17:15:20,210 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-06 17:15:20,211 INFO MainThread:16786 [wandb_init.py:init():522] starting run threads in backend --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:21,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code --2022-04-06 17:15:22,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:22,255 INFO SenderThread:16786 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-06 17:15:22,256 INFO SenderThread:16786 [sender.py:_save_file():829] saving file code/barlow.py with policy now --2022-04-06 17:15:22,261 INFO MainThread:16786 [wandb_run.py:_console_start():1538] atexit reg --2022-04-06 17:15:22,262 INFO MainThread:16786 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-06 17:15:22,264 INFO MainThread:16786 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-06 17:15:22,266 INFO MainThread:16786 [wandb_init.py:init():547] run started, returning control to user process --2022-04-06 17:15:23,209 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:23,210 INFO Thread-11 :16786 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json --2022-04-06 17:15:23,555 INFO Thread-14 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/2ggqgylt-wandb-metadata.json --2022-04-06 17:15:23,635 INFO Thread-17 :16786 [upload_job.py:push():133] Uploaded file /tmp/tmp8udrbs4mwandb/56j3ha1n-code/barlow.py --2022-04-06 17:15:25,349 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:28,351 INFO Thread-11 :16786 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:29,273 INFO SenderThread:16786 [sender.py:finish():933] shutting down sender --2022-04-06 17:15:29,273 INFO WriterThread:16786 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb --2022-04-06 17:15:29,273 INFO SenderThread:16786 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt requirements.txt --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-metadata.json wandb-metadata.json --2022-04-06 17:15:29,351 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log output.log --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml conda-environment.yaml --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json wandb-summary.json --2022-04-06 17:15:29,352 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml config.yaml --2022-04-06 17:15:29,354 INFO SenderThread:16786 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/code/barlow.py code/barlow.py --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:finish():176] shutting down file pusher --2022-04-06 17:15:29,354 INFO SenderThread:16786 [file_pusher.py:join():181] waiting for file pusher --2022-04-06 17:15:30,676 INFO Thread-23 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/output.log --2022-04-06 17:15:30,684 INFO Thread-26 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/config.yaml --2022-04-06 17:15:30,686 INFO Thread-22 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/requirements.txt --2022-04-06 17:15:30,694 INFO Thread-24 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/conda-environment.yaml --2022-04-06 17:15:30,730 INFO Thread-25 :16786 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_enhancement/wandb/run-20220406_171518-s7zesus8/files/wandb-summary.json --2022-04-06 17:15:31,674 ERROR wandb_internal:16786 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-06 17:17:48,865 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,946 INFO MainThread:16786 [wandb_run.py:_restore():1480] restore --2022-04-06 17:17:48,947 INFO MainThread:16786 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb b/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb -deleted file mode 100644 -index cd7ebea..0000000 -Binary files a/wandb/run-20220406_171518-s7zesus8/run-s7zesus8.wandb and /dev/null differ -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py b/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml b/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -deleted file mode 100644 -index f15df21..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch b/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -deleted file mode 100644 -index 0ddeae0..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch -+++ /dev/null -@@ -1,226 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2158287 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,87 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..ee4c0ff 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..29be718 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145845-d3rkwo1k/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..bda663d 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145845-d3rkwo1k --\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/output.log b/wandb/run-20220408_145845-d3rkwo1k/files/output.log -deleted file mode 100644 -index 4d74c7d..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt b/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -deleted file mode 100644 -index 9eb0f02..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:28:48.101605", -- "startedAt": "2022-04-08T09:28:45.736549", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json b/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -deleted file mode 100644 -index 5708b15..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.139744758605957, "_runtime": 22, "_timestamp": 1649410147, "_step": 1, "epoch_loss": 7.139744758605957} -\ No newline at end of file -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -deleted file mode 100644 -index e57e276..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log -+++ /dev/null -@@ -1,74 +0,0 @@ --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,745 DEBUG MainThread:63630 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send():179] send: header --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:45,753 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:46,531 DEBUG SenderThread:63630 [sender.py:send():179] send: run --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:48,099 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():39] meta init --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:__init__():53] meta init done --2022-04-08 14:58:48,101 DEBUG HandlerThread:63630 [meta.py:probe():210] probe --2022-04-08 14:58:48,107 DEBUG HandlerThread:63630 [meta.py:_setup_git():200] setup git --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_setup_git():207] setup git done --2022-04-08 14:58:48,124 DEBUG HandlerThread:63630 [meta.py:_save_code():89] save code --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_code():110] save code done --2022-04-08 14:58:48,132 DEBUG HandlerThread:63630 [meta.py:_save_patches():127] save patches --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_patches():169] save patches done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():57] save pip --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_pip():71] save pip done --2022-04-08 14:58:48,182 DEBUG HandlerThread:63630 [meta.py:_save_conda():78] save conda --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:_save_conda():86] save conda done --2022-04-08 14:58:49,720 DEBUG HandlerThread:63630 [meta.py:probe():252] probe done --2022-04-08 14:58:49,727 DEBUG SenderThread:63630 [sender.py:send():179] send: files --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,737 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:50,547 DEBUG SenderThread:63630 [sender.py:send():179] send: config --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:05,549 DEBUG HandlerThread:63630 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:05,549 DEBUG SenderThread:63630 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:06,836 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: history --2022-04-08 14:59:07,365 DEBUG SenderThread:63630 [sender.py:send():179] send: summary --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log b/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -deleted file mode 100644 -index a6875c4..0000000 ---- a/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log -+++ /dev/null -@@ -1,52 +0,0 @@ --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'd3rkwo1k', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml', 'start_method': 'thread'} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/logs/debug-internal.log --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:58:45,737 INFO MainThread:63630 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:45,738 INFO MainThread:63630 [wandb_init.py:init():418] starting backend --2022-04-08 14:58:45,743 INFO MainThread:63630 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:58:45,744 INFO wandb_internal:63630 [internal.py:wandb_internal():91] W&B internal server running at pid: 63630, started at: 2022-04-08 14:58:45.743405 --2022-04-08 14:58:45,744 INFO MainThread:63630 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:58:45,745 INFO MainThread:63630 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:58:45,746 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 512, 'epochs': 32, 'nhead': 6, 'nlayers': 4} --2022-04-08 14:58:45,748 INFO MainThread:63630 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:58:45,749 INFO MainThread:63630 [wandb_init.py:init():484] communicating current version --2022-04-08 14:58:45,753 INFO WriterThread:63630 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:58:46,531 INFO MainThread:63630 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:58:48,098 INFO SenderThread:63630 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files --2022-04-08 14:58:48,098 INFO SenderThread:63630 [sender.py:_start_run_threads():707] run started: d3rkwo1k with start time 1649410125 --2022-04-08 14:58:48,098 INFO MainThread:63630 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:58:48,099 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code/train_translation.py --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/diff.patch --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:58:49,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/requirements.txt --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:49,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/code --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:58:49,727 INFO SenderThread:63630 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:58:49,728 INFO SenderThread:63630 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:58:49,737 INFO MainThread:63630 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:58:49,739 INFO MainThread:63630 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:58:49,741 INFO MainThread:63630 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/conda-environment.yaml --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:50,098 INFO Thread-11 :63630 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-metadata.json --2022-04-08 14:58:52,067 INFO Thread-14 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2ocynek4-wandb-metadata.json --2022-04-08 14:58:52,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:52,358 INFO Thread-15 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2gxjwsey-code/train_translation.py --2022-04-08 14:58:52,358 INFO Thread-16 :63630 [upload_job.py:push():133] Uploaded file /tmp/tmpgr3njy6lwandb/2au0uu9d-diff.patch --2022-04-08 14:58:54,099 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/config.yaml --2022-04-08 14:58:56,100 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:58:58,133 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:00,168 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/output.log --2022-04-08 14:59:06,838 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:07,169 INFO Thread-11 :63630 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145845-d3rkwo1k/files/wandb-summary.json --2022-04-08 14:59:07,365 INFO SenderThread:63630 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb b/wandb/run-20220408_145845-d3rkwo1k/run-d3rkwo1k.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py b/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml b/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145917-fjhaj183/files/config.yaml b/wandb/run-20220408_145917-fjhaj183/files/config.yaml -deleted file mode 100644 -index d5b49b7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 36 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145917-fjhaj183/files/diff.patch b/wandb/run-20220408_145917-fjhaj183/files/diff.patch -deleted file mode 100644 -index 5bddede..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/diff.patch -+++ /dev/null -@@ -1,228 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..f7a973d 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,89 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..151b958 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..80b3468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145917-fjhaj183/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..abf5aa3 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145917-fjhaj183 --\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/output.log b/wandb/run-20220408_145917-fjhaj183/files/output.log -deleted file mode 100644 -index ceeeb4b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -diff --git a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt b/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -deleted file mode 100644 -index 705a1e7..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:18.659644", -- "startedAt": "2022-04-08T09:29:17.328450", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=36", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json b/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -deleted file mode 100644 -index 1749cae..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140841484069824, "_runtime": 16, "_timestamp": 1649410173, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log b/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -deleted file mode 100644 -index 6a2ea0b..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,338 DEBUG MainThread:63880 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send():179] send: header --2022-04-08 14:59:17,342 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,342 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:17,943 DEBUG SenderThread:63880 [sender.py:send():179] send: run --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:18,657 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():39] meta init --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:__init__():53] meta init done --2022-04-08 14:59:18,659 DEBUG HandlerThread:63880 [meta.py:probe():210] probe --2022-04-08 14:59:18,665 DEBUG HandlerThread:63880 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:18,685 DEBUG HandlerThread:63880 [meta.py:_save_code():89] save code --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_code():110] save code done --2022-04-08 14:59:18,694 DEBUG HandlerThread:63880 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:18,749 DEBUG HandlerThread:63880 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:18,750 DEBUG HandlerThread:63880 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:20,073 DEBUG HandlerThread:63880 [meta.py:probe():252] probe done --2022-04-08 14:59:20,075 DEBUG SenderThread:63880 [sender.py:send():179] send: files --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 DEBUG HandlerThread:63880 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,086 DEBUG SenderThread:63880 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:20,978 DEBUG SenderThread:63880 [sender.py:send():179] send: config --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: history --2022-04-08 14:59:33,642 DEBUG SenderThread:63880 [sender.py:send():179] send: summary --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/logs/debug.log b/wandb/run-20220408_145917-fjhaj183/logs/debug.log -deleted file mode 100644 -index 5f71fa1..0000000 ---- a/wandb/run-20220408_145917-fjhaj183/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjhaj183', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjhaj183.yaml', 'start_method': 'thread'} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/logs/debug-internal.log --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 36, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:17,329 INFO MainThread:63880 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:17,335 INFO MainThread:63880 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:17,336 INFO wandb_internal:63880 [internal.py:wandb_internal():91] W&B internal server running at pid: 63880, started at: 2022-04-08 14:59:17.335830 --2022-04-08 14:59:17,336 INFO MainThread:63880 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:17,338 INFO MainThread:63880 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:17,339 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 36, 'nhead': 4, 'nlayers': 4} --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:17,341 INFO MainThread:63880 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:17,342 INFO WriterThread:63880 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:17,942 INFO MainThread:63880 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:18,597 INFO MainThread:63880 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:18,657 INFO SenderThread:63880 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_start_run_threads():707] run started: fjhaj183 with start time 1649410157 --2022-04-08 14:59:18,657 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/diff.patch --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code/train_translation.py --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/requirements.txt --2022-04-08 14:59:19,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json --2022-04-08 14:59:19,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/code --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:20,075 INFO SenderThread:63880 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:20,076 INFO SenderThread:63880 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:20,085 INFO MainThread:63880 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:20,087 INFO MainThread:63880 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:20,088 INFO MainThread:63880 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:20,089 INFO MainThread:63880 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:20,657 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/conda-environment.yaml --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-metadata.json --2022-04-08 14:59:20,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:22,011 INFO Thread-14 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/jylptjcp-wandb-metadata.json --2022-04-08 14:59:22,139 INFO Thread-16 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/1pe5aukq-diff.patch --2022-04-08 14:59:22,375 INFO Thread-15 :63880 [upload_job.py:push():133] Uploaded file /tmp/tmp9_iiwlg8wandb/20nxn48w-code/train_translation.py --2022-04-08 14:59:22,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:23,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/config.yaml --2022-04-08 14:59:24,658 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:26,659 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/output.log --2022-04-08 14:59:33,644 INFO SenderThread:63880 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:33,718 INFO Thread-11 :63880 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145917-fjhaj183/files/wandb-summary.json -diff --git a/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb b/wandb/run-20220408_145917-fjhaj183/run-fjhaj183.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py b/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml b/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml b/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -deleted file mode 100644 -index 39ea9ed..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 16 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch b/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -deleted file mode 100644 -index 3de404c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/diff.patch -+++ /dev/null -@@ -1,230 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..1036f20 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,91 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..33a9122 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..622b540 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_145943-fjlzyv53/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c775116 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_145943-fjlzyv53 --\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/output.log b/wandb/run-20220408_145943-fjlzyv53/files/output.log -deleted file mode 100644 -index 0a584f7..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt b/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -deleted file mode 100644 -index 321b5fe..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:29:44.714511", -- "startedAt": "2022-04-08T09:29:43.530748", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=16", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json b/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -deleted file mode 100644 -index 43fa534..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.180241584777832, "_runtime": 16, "_timestamp": 1649410199, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -deleted file mode 100644 -index 1bb5ef6..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,540 DEBUG MainThread:64131 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send():179] send: header --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,544 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: check_version --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:43,999 DEBUG SenderThread:64131 [sender.py:send():179] send: run --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:44,712 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():39] meta init --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:__init__():53] meta init done --2022-04-08 14:59:44,714 DEBUG HandlerThread:64131 [meta.py:probe():210] probe --2022-04-08 14:59:44,720 DEBUG HandlerThread:64131 [meta.py:_setup_git():200] setup git --2022-04-08 14:59:44,739 DEBUG HandlerThread:64131 [meta.py:_setup_git():207] setup git done --2022-04-08 14:59:44,740 DEBUG HandlerThread:64131 [meta.py:_save_code():89] save code --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_code():110] save code done --2022-04-08 14:59:44,748 DEBUG HandlerThread:64131 [meta.py:_save_patches():127] save patches --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_patches():169] save patches done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():57] save pip --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_pip():71] save pip done --2022-04-08 14:59:44,809 DEBUG HandlerThread:64131 [meta.py:_save_conda():78] save conda --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:_save_conda():86] save conda done --2022-04-08 14:59:46,120 DEBUG HandlerThread:64131 [meta.py:probe():252] probe done --2022-04-08 14:59:46,122 DEBUG SenderThread:64131 [sender.py:send():179] send: files --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 DEBUG HandlerThread:64131 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,133 DEBUG SenderThread:64131 [sender.py:send_request():193] send_request: stop_status --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,710 DEBUG SenderThread:64131 [sender.py:send():179] send: config --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: history --2022-04-08 14:59:59,111 DEBUG SenderThread:64131 [sender.py:send():179] send: summary --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log b/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -deleted file mode 100644 -index 042323c..0000000 ---- a/wandb/run-20220408_145943-fjlzyv53/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fjlzyv53', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml', 'start_method': 'thread'} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug.log --2022-04-08 14:59:43,531 INFO MainThread:64131 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/logs/debug-internal.log --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():369] calling init triggers --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 32, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 1024, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:43,532 INFO MainThread:64131 [wandb_init.py:init():418] starting backend --2022-04-08 14:59:43,537 INFO MainThread:64131 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 14:59:43,538 INFO wandb_internal:64131 [internal.py:wandb_internal():91] W&B internal server running at pid: 64131, started at: 2022-04-08 14:59:43.537952 --2022-04-08 14:59:43,539 INFO MainThread:64131 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 14:59:43,540 INFO MainThread:64131 [wandb_init.py:init():423] backend started and connected --2022-04-08 14:59:43,541 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 16, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 6, 'nlayers': 2} --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():465] updated telemetry --2022-04-08 14:59:43,543 INFO MainThread:64131 [wandb_init.py:init():484] communicating current version --2022-04-08 14:59:43,544 INFO WriterThread:64131 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 14:59:43,999 INFO MainThread:64131 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 14:59:44,710 INFO SenderThread:64131 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files --2022-04-08 14:59:44,710 INFO SenderThread:64131 [sender.py:_start_run_threads():707] run started: fjlzyv53 with start time 1649410183 --2022-04-08 14:59:44,711 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:44,711 INFO MainThread:64131 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code/train_translation.py --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/requirements.txt --2022-04-08 14:59:45,711 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/diff.patch --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:45,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/code --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 14:59:46,122 INFO SenderThread:64131 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 14:59:46,123 INFO SenderThread:64131 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 14:59:46,133 INFO MainThread:64131 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 14:59:46,135 INFO MainThread:64131 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 14:59:46,137 INFO MainThread:64131 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 14:59:46,712 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/conda-environment.yaml --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-metadata.json --2022-04-08 14:59:46,713 INFO Thread-11 :64131 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:47,796 INFO Thread-14 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3fbo2hr0-wandb-metadata.json --2022-04-08 14:59:47,797 INFO Thread-16 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/pqn45v2p-diff.patch --2022-04-08 14:59:47,800 INFO Thread-15 :64131 [upload_job.py:push():133] Uploaded file /tmp/tmpuhuvd94zwandb/3862f493-code/train_translation.py --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/config.yaml --2022-04-08 14:59:48,715 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:50,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:52,716 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/output.log --2022-04-08 14:59:59,114 INFO SenderThread:64131 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 14:59:59,769 INFO Thread-11 :64131 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_145943-fjlzyv53/files/wandb-summary.json -diff --git a/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb b/wandb/run-20220408_145943-fjlzyv53/run-fjlzyv53.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py b/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml b/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150006-abict4v2/files/config.yaml b/wandb/run-20220408_150006-abict4v2/files/config.yaml -deleted file mode 100644 -index 55505a9..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 20 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 8 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150006-abict4v2/files/diff.patch b/wandb/run-20220408_150006-abict4v2/files/diff.patch -deleted file mode 100644 -index cae01c4..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/diff.patch -+++ /dev/null -@@ -1,232 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..a79a795 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,93 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..baa82b6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..79d1f8d 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150006-abict4v2/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..4572147 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150006-abict4v2 --\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/output.log b/wandb/run-20220408_150006-abict4v2/files/output.log -deleted file mode 100644 -index 18438a2..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/output.log -+++ /dev/null -@@ -1,14 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:261: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -diff --git a/wandb/run-20220408_150006-abict4v2/files/requirements.txt b/wandb/run-20220408_150006-abict4v2/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json b/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -deleted file mode 100644 -index f46fef8..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:08.569102", -- "startedAt": "2022-04-08T09:30:06.988517", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=20", -- "--nhead=8", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json b/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -deleted file mode 100644 -index 4c47552..0000000 ---- a/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.120020389556885, "_runtime": 21, "_timestamp": 1649410227, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log b/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -deleted file mode 100644 -index eb4114e..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log -+++ /dev/null -@@ -1,71 +0,0 @@ --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,998 DEBUG MainThread:64393 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send():179] send: header --2022-04-08 15:00:07,002 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:07,002 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:07,447 DEBUG SenderThread:64393 [sender.py:send():179] send: run --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,565 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:08,566 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:08,568 DEBUG HandlerThread:64393 [meta.py:__init__():39] meta init --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:__init__():53] meta init done --2022-04-08 15:00:08,569 DEBUG HandlerThread:64393 [meta.py:probe():210] probe --2022-04-08 15:00:08,574 DEBUG HandlerThread:64393 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:08,594 DEBUG HandlerThread:64393 [meta.py:_save_code():89] save code --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_code():110] save code done --2022-04-08 15:00:08,603 DEBUG HandlerThread:64393 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:08,656 DEBUG HandlerThread:64393 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:08,657 DEBUG HandlerThread:64393 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:10,003 DEBUG HandlerThread:64393 [meta.py:probe():252] probe done --2022-04-08 15:00:10,005 DEBUG SenderThread:64393 [sender.py:send():179] send: files --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:11,189 DEBUG SenderThread:64393 [sender.py:send():179] send: config --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:26,191 DEBUG HandlerThread:64393 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:26,191 DEBUG SenderThread:64393 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: history --2022-04-08 15:00:27,421 DEBUG SenderThread:64393 [sender.py:send():179] send: summary --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/logs/debug.log b/wandb/run-20220408_150006-abict4v2/logs/debug.log -deleted file mode 100644 -index 2782e5f..0000000 ---- a/wandb/run-20220408_150006-abict4v2/logs/debug.log -+++ /dev/null -@@ -1,51 +0,0 @@ --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'abict4v2', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-abict4v2.yaml', 'start_method': 'thread'} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/logs/debug-internal.log --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:06,989 INFO MainThread:64393 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --config: {'workers': 4, 'epochs': 20, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 8, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:06,990 INFO MainThread:64393 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:06,995 INFO MainThread:64393 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:06,996 INFO wandb_internal:64393 [internal.py:wandb_internal():91] W&B internal server running at pid: 64393, started at: 2022-04-08 15:00:06.995764 --2022-04-08 15:00:06,996 INFO MainThread:64393 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:06,997 INFO MainThread:64393 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:06,999 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 20, 'nhead': 8, 'nlayers': 6} --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:07,001 INFO MainThread:64393 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:07,002 INFO WriterThread:64393 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:07,446 INFO MainThread:64393 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:08,564 INFO SenderThread:64393 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files --2022-04-08 15:00:08,564 INFO SenderThread:64393 [sender.py:_start_run_threads():707] run started: abict4v2 with start time 1649410206 --2022-04-08 15:00:08,566 INFO MainThread:64393 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:08,566 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:09,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/requirements.txt --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code/train_translation.py --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/diff.patch --2022-04-08 15:00:09,567 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/code --2022-04-08 15:00:10,005 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:10,006 INFO SenderThread:64393 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:10,007 INFO SenderThread:64393 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:10,014 INFO MainThread:64393 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:10,015 INFO MainThread:64393 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:10,018 INFO MainThread:64393 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:10,019 INFO MainThread:64393 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/conda-environment.yaml --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:10,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-metadata.json --2022-04-08 15:00:12,363 INFO Thread-14 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/166an6d7-wandb-metadata.json --2022-04-08 15:00:12,365 INFO Thread-20 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/1a4gpeq3-diff.patch --2022-04-08 15:00:12,565 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:12,588 INFO Thread-15 :64393 [upload_job.py:push():133] Uploaded file /tmp/tmplw_yhgi2wandb/2g7bx28s-code/train_translation.py --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:14,566 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/config.yaml --2022-04-08 15:00:18,643 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:20,644 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/output.log --2022-04-08 15:00:27,424 INFO SenderThread:64393 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:27,647 INFO Thread-11 :64393 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150006-abict4v2/files/wandb-summary.json -diff --git a/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb b/wandb/run-20220408_150006-abict4v2/run-abict4v2.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py b/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -deleted file mode 100644 -index e482ba7..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py -+++ /dev/null -@@ -1,364 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml b/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml b/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -deleted file mode 100644 -index ea14f0e..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 64 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch b/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -deleted file mode 100644 -index 47b804f..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/diff.patch -+++ /dev/null -@@ -1,234 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2248477 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,95 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..e482ba7 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -299,7 +299,9 @@ def main_worker(gpu, args): -- predicted.pop() -- target.pop() -- --- print(bleu_score(predicted, target)) --+ bleu_score = bleu_score(predicted, target) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,7 +313,7 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..165ed2c 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..f1325dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_150037-ba0yl54z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..1413293 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_150037-ba0yl54z --\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/output.log b/wandb/run-20220408_150037-ba0yl54z/files/output.log -deleted file mode 100644 -index 6742216..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt b/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -deleted file mode 100644 -index 5a492ae..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T09:30:38.254663", -- "startedAt": "2022-04-08T09:30:37.394479", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=64", -- "--dfeedforward=512", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json b/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -deleted file mode 100644 -index 662ac89..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.082856178283691, "_runtime": 16, "_timestamp": 1649410253, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -deleted file mode 100644 -index 0c041a1..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,404 DEBUG MainThread:64646 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 DEBUG SenderThread:64646 [sender.py:send():179] send: header --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,410 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:00:37,410 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:37,611 DEBUG SenderThread:64646 [sender.py:send():179] send: run --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:38,252 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():39] meta init --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:__init__():53] meta init done --2022-04-08 15:00:38,254 DEBUG HandlerThread:64646 [meta.py:probe():210] probe --2022-04-08 15:00:38,260 DEBUG HandlerThread:64646 [meta.py:_setup_git():200] setup git --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_setup_git():207] setup git done --2022-04-08 15:00:38,280 DEBUG HandlerThread:64646 [meta.py:_save_code():89] save code --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_code():110] save code done --2022-04-08 15:00:38,289 DEBUG HandlerThread:64646 [meta.py:_save_patches():127] save patches --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_patches():169] save patches done --2022-04-08 15:00:38,341 DEBUG HandlerThread:64646 [meta.py:_save_pip():57] save pip --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_pip():71] save pip done --2022-04-08 15:00:38,342 DEBUG HandlerThread:64646 [meta.py:_save_conda():78] save conda --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:_save_conda():86] save conda done --2022-04-08 15:00:39,663 DEBUG HandlerThread:64646 [meta.py:probe():252] probe done --2022-04-08 15:00:39,665 DEBUG SenderThread:64646 [sender.py:send():179] send: files --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,676 DEBUG HandlerThread:64646 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:00:39,676 DEBUG SenderThread:64646 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:40,430 DEBUG SenderThread:64646 [sender.py:send():179] send: config --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: history --2022-04-08 15:00:53,735 DEBUG SenderThread:64646 [sender.py:send():179] send: summary --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log b/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -deleted file mode 100644 -index 4346748..0000000 ---- a/wandb/run-20220408_150037-ba0yl54z/logs/debug.log -+++ /dev/null -@@ -1,50 +0,0 @@ --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'lrpyor0l', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'ba0yl54z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml', 'start_method': 'thread'} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/logs/debug-internal.log --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --config: {'workers': 4, 'epochs': 32, 'batch_size': 64, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 512, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:37,395 INFO MainThread:64646 [wandb_init.py:init():418] starting backend --2022-04-08 15:00:37,401 INFO MainThread:64646 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:00:37,402 INFO wandb_internal:64646 [internal.py:wandb_internal():91] W&B internal server running at pid: 64646, started at: 2022-04-08 15:00:37.401702 --2022-04-08 15:00:37,402 INFO MainThread:64646 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:00:37,404 INFO MainThread:64646 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:00:37,406 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 64, 'dfeedforward': 512, 'epochs': 32, 'nhead': 2, 'nlayers': 6} --2022-04-08 15:00:37,408 INFO MainThread:64646 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:00:37,409 INFO MainThread:64646 [wandb_init.py:init():484] communicating current version --2022-04-08 15:00:37,409 INFO WriterThread:64646 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:00:37,610 INFO MainThread:64646 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:00:38,249 INFO SenderThread:64646 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files --2022-04-08 15:00:38,250 INFO SenderThread:64646 [sender.py:_start_run_threads():707] run started: ba0yl54z with start time 1649410237 --2022-04-08 15:00:38,251 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:38,252 INFO MainThread:64646 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/requirements.txt --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/diff.patch --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code/train_translation.py --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:39,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/code --2022-04-08 15:00:39,665 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:00:39,666 INFO SenderThread:64646 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:00:39,667 INFO SenderThread:64646 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:00:39,676 INFO MainThread:64646 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:00:39,678 INFO MainThread:64646 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:00:39,680 INFO MainThread:64646 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/conda-environment.yaml --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-metadata.json --2022-04-08 15:00:40,250 INFO Thread-11 :64646 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:41,110 INFO Thread-16 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1bd5x3gn-diff.patch --2022-04-08 15:00:41,186 INFO Thread-15 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1kw8gilq-code/train_translation.py --2022-04-08 15:00:41,285 INFO Thread-14 :64646 [upload_job.py:push():133] Uploaded file /tmp/tmpy70agkq_wandb/1nmym46e-wandb-metadata.json --2022-04-08 15:00:42,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:43,251 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/config.yaml --2022-04-08 15:00:46,252 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:48,253 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/output.log --2022-04-08 15:00:53,737 INFO SenderThread:64646 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:00:54,255 INFO Thread-11 :64646 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_150037-ba0yl54z/files/wandb-summary.json -diff --git a/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb b/wandb/run-20220408_150037-ba0yl54z/run-ba0yl54z.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py b/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml b/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml b/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -deleted file mode 100644 -index 546bdaa..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 512 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 16 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch b/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -deleted file mode 100644 -index c98ba4e..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/diff.patch -+++ /dev/null -@@ -1,285 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ea51a40 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,97 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f8e98b2 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..9304e2b 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153004-dg43ixc4/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b02872b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153004-dg43ixc4 --\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/output.log b/wandb/run-20220408_153004-dg43ixc4/files/output.log -deleted file mode 100644 -index f49019d..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/output.log -+++ /dev/null -@@ -1,11 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt b/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -deleted file mode 100644 -index 109e1b6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:05.796412", -- "startedAt": "2022-04-08T10:00:04.837672", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=512", -- "--epochs=16", -- "--nhead=6", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json b/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -deleted file mode 100644 -index 09cdda6..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.140233993530273, "_runtime": 15, "_timestamp": 1649412019, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -deleted file mode 100644 -index 9669aaf..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log -+++ /dev/null -@@ -1,67 +0,0 @@ --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,848 DEBUG MainThread:65348 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,851 DEBUG SenderThread:65348 [sender.py:send():179] send: header --2022-04-08 15:30:04,851 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:04,852 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,022 DEBUG SenderThread:65348 [sender.py:send():179] send: run --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:05,794 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():39] meta init --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:__init__():53] meta init done --2022-04-08 15:30:05,796 DEBUG HandlerThread:65348 [meta.py:probe():210] probe --2022-04-08 15:30:05,802 DEBUG HandlerThread:65348 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:05,821 DEBUG HandlerThread:65348 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:05,822 DEBUG HandlerThread:65348 [meta.py:_save_code():89] save code --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_code():110] save code done --2022-04-08 15:30:05,831 DEBUG HandlerThread:65348 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:05,886 DEBUG HandlerThread:65348 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:07,220 DEBUG HandlerThread:65348 [meta.py:probe():252] probe done --2022-04-08 15:30:07,221 DEBUG SenderThread:65348 [sender.py:send():179] send: files --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,232 DEBUG HandlerThread:65348 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:07,233 DEBUG SenderThread:65348 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,677 DEBUG SenderThread:65348 [sender.py:send():179] send: config --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: history --2022-04-08 15:30:19,407 DEBUG SenderThread:65348 [sender.py:send():179] send: summary --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log b/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -deleted file mode 100644 -index 66c14b1..0000000 ---- a/wandb/run-20220408_153004-dg43ixc4/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'dg43ixc4', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml', 'start_method': 'thread'} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/logs/debug-internal.log --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --config: {'workers': 4, 'epochs': 16, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 512, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:04,839 INFO MainThread:65348 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:04,845 INFO MainThread:65348 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:04,846 INFO wandb_internal:65348 [internal.py:wandb_internal():91] W&B internal server running at pid: 65348, started at: 2022-04-08 15:30:04.845569 --2022-04-08 15:30:04,846 INFO MainThread:65348 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:04,848 INFO MainThread:65348 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:04,849 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 512, 'epochs': 16, 'nhead': 6, 'nlayers': 4} --2022-04-08 15:30:04,850 INFO MainThread:65348 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:04,851 INFO MainThread:65348 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:04,852 INFO WriterThread:65348 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:05,021 INFO MainThread:65348 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:05,792 INFO SenderThread:65348 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files --2022-04-08 15:30:05,792 INFO SenderThread:65348 [sender.py:_start_run_threads():707] run started: dg43ixc4 with start time 1649412004 --2022-04-08 15:30:05,793 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:05,793 INFO MainThread:65348 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code/train_translation.py --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/requirements.txt --2022-04-08 15:30:06,794 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/diff.patch --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:06,795 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/code --2022-04-08 15:30:07,222 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:07,223 INFO SenderThread:65348 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:07,232 INFO MainThread:65348 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:07,234 INFO MainThread:65348 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:07,235 INFO MainThread:65348 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:07,236 INFO MainThread:65348 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/conda-environment.yaml --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-metadata.json --2022-04-08 15:30:07,792 INFO Thread-11 :65348 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:08,525 INFO Thread-16 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/npor673v-diff.patch --2022-04-08 15:30:08,527 INFO Thread-14 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/1fwboqq3-wandb-metadata.json --2022-04-08 15:30:08,548 INFO Thread-15 :65348 [upload_job.py:push():133] Uploaded file /tmp/tmpuu5pqhpgwandb/2pescb75-code/train_translation.py --2022-04-08 15:30:09,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:09,943 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/config.yaml --2022-04-08 15:30:11,936 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/output.log --2022-04-08 15:30:19,409 INFO SenderThread:65348 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:19,939 INFO Thread-11 :65348 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153004-dg43ixc4/files/wandb-summary.json -diff --git a/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb b/wandb/run-20220408_153004-dg43ixc4/run-dg43ixc4.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py b/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -deleted file mode 100644 -index 52a946e..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py -+++ /dev/null -@@ -1,370 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- --############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) --# print(bleu_score(predicted, target)) --############################################################## --# if epoch%1 ==0 : --# torch.save(model.module.state_dict(), --# 'path.pth') --# print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml b/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml b/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -deleted file mode 100644 -index 122f33a..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/config.yaml -+++ /dev/null -@@ -1,101 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 256 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 6 --nlayers: -- desc: null -- value: 2 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch b/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -deleted file mode 100644 -index 797f0a1..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/diff.patch -+++ /dev/null -@@ -1,287 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..356076f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,99 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..52a946e 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -143,9 +143,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -256,7 +256,7 @@ def main_worker(gpu, args): -- optimizer.step() -- # losses += loss.item() -- ---# wandb.log({'iter_loss': loss}) --+ wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- --@@ -267,7 +267,7 @@ def main_worker(gpu, args): -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --+ wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --@@ -279,27 +279,9 @@ def main_worker(gpu, args): -- ############################################################## -- if epoch%args.checkbleu ==0 : -- --- model.eval() --- predicted=[] --- target=[] --- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --- --- print(bleu_score(predicted, target)) --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+# print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), --@@ -311,10 +293,36 @@ def main_worker(gpu, args): -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --+ wandb.finish() -- -- -- --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] --+ --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() --+ --+ bleu_score = bleu_score(predicted, target) --+ --+ return bleu_score --+ -- ''' -- todo: -- BLEU score --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7b452fc 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..48b2ecd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220408_153027-fwwd5rya/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..93be230 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220408_153027-fwwd5rya --\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/output.log b/wandb/run-20220408_153027-fwwd5rya/files/output.log -deleted file mode 100644 -index e86aeca..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-17: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt b/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -deleted file mode 100644 -index dcac75d..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-08T10:00:27.794832", -- "startedAt": "2022-04-08T10:00:27.031889", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=256", -- "--dfeedforward=256", -- "--epochs=40", -- "--nhead=6", -- "--nlayers=2" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json b/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -deleted file mode 100644 -index e70a2b8..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log -+++ /dev/null -@@ -1,99 +0,0 @@ --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,040 DEBUG MainThread:65601 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,046 DEBUG SenderThread:65601 [sender.py:send():179] send: header --2022-04-08 15:30:27,046 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: check_version --2022-04-08 15:30:27,047 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: check_version --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,254 DEBUG SenderThread:65601 [sender.py:send():179] send: run --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 DEBUG SenderThread:65601 [sender.py:send():179] send: summary --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:27,792 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: run_start --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():39] meta init --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:__init__():53] meta init done --2022-04-08 15:30:27,794 DEBUG HandlerThread:65601 [meta.py:probe():210] probe --2022-04-08 15:30:27,800 DEBUG HandlerThread:65601 [meta.py:_setup_git():200] setup git --2022-04-08 15:30:27,819 DEBUG HandlerThread:65601 [meta.py:_setup_git():207] setup git done --2022-04-08 15:30:27,820 DEBUG HandlerThread:65601 [meta.py:_save_code():89] save code --2022-04-08 15:30:27,828 DEBUG HandlerThread:65601 [meta.py:_save_code():110] save code done --2022-04-08 15:30:27,829 DEBUG HandlerThread:65601 [meta.py:_save_patches():127] save patches --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_patches():169] save patches done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():57] save pip --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_pip():71] save pip done --2022-04-08 15:30:27,882 DEBUG HandlerThread:65601 [meta.py:_save_conda():78] save conda --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:_save_conda():86] save conda done --2022-04-08 15:30:29,200 DEBUG HandlerThread:65601 [meta.py:probe():252] probe done --2022-04-08 15:30:29,202 DEBUG SenderThread:65601 [sender.py:send():179] send: files --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 DEBUG HandlerThread:65601 [handler.py:handle_request():124] handle_request: stop_status --2022-04-08 15:30:29,214 DEBUG SenderThread:65601 [sender.py:send_request():193] send_request: stop_status --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,791 DEBUG SenderThread:65601 [sender.py:send():179] send: config --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log b/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -deleted file mode 100644 -index 987c5d6..0000000 ---- a/wandb/run-20220408_153027-fwwd5rya/logs/debug.log -+++ /dev/null -@@ -1,84 +0,0 @@ --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': 'q27ijx1y', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'fwwd5rya', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml', 'start_method': 'thread'} --2022-04-08 15:30:27,032 INFO MainThread:65601 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/logs/debug-internal.log --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():369] calling init triggers --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --config: {'workers': 4, 'epochs': 40, 'batch_size': 256, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 6, 'dfeedforward': 256, 'nlayers': 2, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:27,033 INFO MainThread:65601 [wandb_init.py:init():418] starting backend --2022-04-08 15:30:27,038 INFO MainThread:65601 [backend.py:ensure_launched():132] starting backend process... --2022-04-08 15:30:27,039 INFO MainThread:65601 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-08 15:30:27,040 INFO wandb_internal:65601 [internal.py:wandb_internal():91] W&B internal server running at pid: 65601, started at: 2022-04-08 15:30:27.039181 --2022-04-08 15:30:27,040 INFO MainThread:65601 [wandb_init.py:init():423] backend started and connected --2022-04-08 15:30:27,043 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 256, 'dfeedforward': 256, 'epochs': 40, 'nhead': 6, 'nlayers': 2} --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():465] updated telemetry --2022-04-08 15:30:27,045 INFO MainThread:65601 [wandb_init.py:init():484] communicating current version --2022-04-08 15:30:27,046 INFO WriterThread:65601 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:27,253 INFO MainThread:65601 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.13 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-08 15:30:27,254 INFO MainThread:65601 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-08 15:30:27,789 INFO SenderThread:65601 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:27,789 INFO SenderThread:65601 [sender.py:_start_run_threads():707] run started: fwwd5rya with start time 1649412027 --2022-04-08 15:30:27,791 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-08 15:30:27,792 INFO MainThread:65601 [wandb_init.py:init():522] starting run threads in backend --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:28,791 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch --2022-04-08 15:30:28,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code --2022-04-08 15:30:29,202 INFO SenderThread:65601 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-08 15:30:29,203 INFO SenderThread:65601 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-08 15:30:29,213 INFO MainThread:65601 [wandb_run.py:_console_start():1538] atexit reg --2022-04-08 15:30:29,214 INFO MainThread:65601 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-08 15:30:29,215 INFO MainThread:65601 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-08 15:30:29,216 INFO MainThread:65601 [wandb_init.py:init():547] run started, returning control to user process --2022-04-08 15:30:29,218 INFO MainThread:65601 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-08 15:30:29,792 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json --2022-04-08 15:30:29,793 INFO Thread-11 :65601 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:30,468 INFO Thread-14 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/wm4wxh62-wandb-metadata.json --2022-04-08 15:30:30,483 INFO Thread-15 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/12sn1grf-code/train_translation.py --2022-04-08 15:30:30,586 INFO Thread-16 :65601 [upload_job.py:push():133] Uploaded file /tmp/tmp_5d66la0wandb/1yya4rls-diff.patch --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:31,796 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:33,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:35,797 INFO Thread-11 :65601 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:36,051 WARNING wandb_internal:65601 [internal.py:is_dead():367] Internal process exiting, parent pid 65592 disappeared --2022-04-08 15:30:36,051 ERROR wandb_internal:65601 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-08 15:30:36,225 INFO WriterThread:65601 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb --2022-04-08 15:30:36,225 INFO SenderThread:65601 [sender.py:finish():933] shutting down sender --2022-04-08 15:30:36,225 INFO SenderThread:65601 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt requirements.txt --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-metadata.json wandb-metadata.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log output.log --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml conda-environment.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json wandb-summary.json --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml config.yaml --2022-04-08 15:30:36,798 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/diff.patch diff.patch --2022-04-08 15:30:36,800 INFO SenderThread:65601 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/code/train_translation.py code/train_translation.py --2022-04-08 15:30:36,800 INFO SenderThread:65601 [file_pusher.py:finish():176] shutting down file pusher --2022-04-08 15:30:36,801 INFO SenderThread:65601 [file_pusher.py:join():181] waiting for file pusher --2022-04-08 15:30:38,053 INFO Thread-27 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/config.yaml --2022-04-08 15:30:38,054 INFO Thread-25 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/conda-environment.yaml --2022-04-08 15:30:38,246 INFO Thread-23 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/requirements.txt --2022-04-08 15:30:38,247 INFO Thread-24 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/output.log --2022-04-08 15:30:38,687 INFO Thread-26 :65601 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220408_153027-fwwd5rya/files/wandb-summary.json --2022-04-08 15:30:40,967 ERROR wandb_internal:65601 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError -diff --git a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb b/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb -deleted file mode 100644 -index bfb12ff..0000000 -Binary files a/wandb/run-20220408_153027-fwwd5rya/run-fwwd5rya.wandb and /dev/null differ -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py b/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml b/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml b/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch b/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -deleted file mode 100644 -index bd71761..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/diff.patch -+++ /dev/null -@@ -1,377 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..d3a775c 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,100 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..74ec524 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..c957937 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152616-3a3gw94y/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..287708f 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152616-3a3gw94y --\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/output.log b/wandb/run-20220409_152616-3a3gw94y/files/output.log -deleted file mode 100644 -index 13e9c3e..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt b/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -deleted file mode 100644 -index 20f0482..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:56:17.429229", -- "startedAt": "2022-04-09T09:56:16.815816", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json b/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -deleted file mode 100644 -index 5602f92..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 16, "_timestamp": 1649498192, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -deleted file mode 100644 -index 2546fd3..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,824 DEBUG MainThread:3266 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,828 DEBUG SenderThread:3266 [sender.py:send():179] send: header --2022-04-09 15:26:16,829 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:16,984 DEBUG SenderThread:3266 [sender.py:send():179] send: run --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:17,426 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():39] meta init --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:__init__():53] meta init done --2022-04-09 15:26:17,429 DEBUG HandlerThread:3266 [meta.py:probe():210] probe --2022-04-09 15:26:17,435 DEBUG HandlerThread:3266 [meta.py:_setup_git():200] setup git --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_setup_git():207] setup git done --2022-04-09 15:26:17,450 DEBUG HandlerThread:3266 [meta.py:_save_code():89] save code --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_code():110] save code done --2022-04-09 15:26:17,456 DEBUG HandlerThread:3266 [meta.py:_save_patches():127] save patches --2022-04-09 15:26:17,564 DEBUG HandlerThread:3266 [meta.py:_save_patches():169] save patches done --2022-04-09 15:26:17,565 DEBUG HandlerThread:3266 [meta.py:_save_pip():57] save pip --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_pip():71] save pip done --2022-04-09 15:26:17,566 DEBUG HandlerThread:3266 [meta.py:_save_conda():78] save conda --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:_save_conda():86] save conda done --2022-04-09 15:26:19,487 DEBUG HandlerThread:3266 [meta.py:probe():252] probe done --2022-04-09 15:26:19,491 DEBUG SenderThread:3266 [sender.py:send():179] send: files --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 DEBUG HandlerThread:3266 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:26:19,497 DEBUG SenderThread:3266 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:19,831 DEBUG SenderThread:3266 [sender.py:send():179] send: config --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: history --2022-04-09 15:26:32,511 DEBUG SenderThread:3266 [sender.py:send():179] send: summary --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log b/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -deleted file mode 100644 -index ebbf034..0000000 ---- a/wandb/run-20220409_152616-3a3gw94y/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/logs/debug-internal.log --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:16,817 INFO MainThread:3266 [wandb_init.py:init():418] starting backend --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:26:16,822 INFO MainThread:3266 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:26:16,823 INFO wandb_internal:3266 [internal.py:wandb_internal():91] W&B internal server running at pid: 3266, started at: 2022-04-09 15:26:16.822572 --2022-04-09 15:26:16,823 INFO MainThread:3266 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:26:16,827 INFO MainThread:3266 [wandb_init.py:init():484] communicating current version --2022-04-09 15:26:16,828 INFO WriterThread:3266 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb --2022-04-09 15:26:16,980 INFO MainThread:3266 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:26:16,981 INFO MainThread:3266 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:26:17,424 INFO SenderThread:3266 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files --2022-04-09 15:26:17,424 INFO SenderThread:3266 [sender.py:_start_run_threads():707] run started: 3a3gw94y with start time 1649498176 --2022-04-09 15:26:17,425 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:17,426 INFO MainThread:3266 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code/train_translation.py --2022-04-09 15:26:18,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/requirements.txt --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/diff.patch --2022-04-09 15:26:18,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/code --2022-04-09 15:26:19,424 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/conda-environment.yaml --2022-04-09 15:26:19,491 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:26:19,492 INFO SenderThread:3266 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:26:19,497 INFO MainThread:3266 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:26:19,501 INFO MainThread:3266 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:26:19,502 INFO MainThread:3266 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:26:19,505 INFO MainThread:3266 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:20,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-metadata.json --2022-04-09 15:26:20,885 INFO Thread-14 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1te7qq4j-wandb-metadata.json --2022-04-09 15:26:20,887 INFO Thread-22 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/tiwzm18e-diff.patch --2022-04-09 15:26:20,888 INFO Thread-17 :3266 [upload_job.py:push():133] Uploaded file /tmp/tmpegocdq1xwandb/1x2d20v2-code/train_translation.py --2022-04-09 15:26:21,425 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/config.yaml --2022-04-09 15:26:22,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:24,426 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:26,427 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/output.log --2022-04-09 15:26:32,514 INFO SenderThread:3266 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:26:33,430 INFO Thread-11 :3266 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152616-3a3gw94y/files/wandb-summary.json -diff --git a/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb b/wandb/run-20220409_152616-3a3gw94y/run-3a3gw94y.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py b/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -deleted file mode 100644 -index 197ab25..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu_score = bleu_score(predicted, target) -- -- return bleu_score -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml b/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml b/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch b/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -deleted file mode 100644 -index c3ed101..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/diff.patch -+++ /dev/null -@@ -1,379 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..ed88fe4 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,102 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..197ab25 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu_score = bleu_score(predicted, target) -- --+ return bleu_score -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..4895794 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..1f9d48c 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_152708-15jgzcwp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..dfe2dcb 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_152708-15jgzcwp --\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/output.log b/wandb/run-20220409_152708-15jgzcwp/files/output.log -deleted file mode 100644 -index 9a9a49f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt b/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -deleted file mode 100644 -index abaad7d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T09:57:09.613679", -- "startedAt": "2022-04-09T09:57:08.966939", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json b/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -deleted file mode 100644 -index 0164a0d..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 12, "_timestamp": 1649498241, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -deleted file mode 100644 -index de7918e..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,002 DEBUG MainThread:3540 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,017 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send():179] send: header --2022-04-09 15:27:09,018 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: check_version --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,109 DEBUG SenderThread:3540 [sender.py:send():179] send: run --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:09,611 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():39] meta init --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:__init__():53] meta init done --2022-04-09 15:27:09,613 DEBUG HandlerThread:3540 [meta.py:probe():210] probe --2022-04-09 15:27:09,619 DEBUG HandlerThread:3540 [meta.py:_setup_git():200] setup git --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_setup_git():207] setup git done --2022-04-09 15:27:09,636 DEBUG HandlerThread:3540 [meta.py:_save_code():89] save code --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_code():110] save code done --2022-04-09 15:27:09,644 DEBUG HandlerThread:3540 [meta.py:_save_patches():127] save patches --2022-04-09 15:27:09,693 DEBUG HandlerThread:3540 [meta.py:_save_patches():169] save patches done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():57] save pip --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_pip():71] save pip done --2022-04-09 15:27:09,694 DEBUG HandlerThread:3540 [meta.py:_save_conda():78] save conda --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,002 DEBUG HandlerThread:3540 [meta.py:_save_conda():86] save conda done --2022-04-09 15:27:11,003 DEBUG HandlerThread:3540 [meta.py:probe():252] probe done --2022-04-09 15:27:11,004 DEBUG SenderThread:3540 [sender.py:send():179] send: files --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 DEBUG HandlerThread:3540 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 DEBUG SenderThread:3540 [sender.py:send_request():193] send_request: stop_status --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,362 DEBUG SenderThread:3540 [sender.py:send():179] send: config --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: history --2022-04-09 15:27:21,558 DEBUG SenderThread:3540 [sender.py:send():179] send: summary --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log b/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -deleted file mode 100644 -index 023162f..0000000 ---- a/wandb/run-20220409_152708-15jgzcwp/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 15:27:08,971 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug.log --2022-04-09 15:27:08,972 INFO MainThread:3540 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/logs/debug-internal.log --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():369] calling init triggers --2022-04-09 15:27:08,973 INFO MainThread:3540 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:08,974 INFO MainThread:3540 [wandb_init.py:init():418] starting backend --2022-04-09 15:27:08,994 INFO MainThread:3540 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 15:27:08,996 INFO MainThread:3540 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 15:27:08,998 INFO wandb_internal:3540 [internal.py:wandb_internal():91] W&B internal server running at pid: 3540, started at: 2022-04-09 15:27:08.995965 --2022-04-09 15:27:09,002 INFO MainThread:3540 [wandb_init.py:init():423] backend started and connected --2022-04-09 15:27:09,013 INFO MainThread:3540 [wandb_init.py:init():465] updated telemetry --2022-04-09 15:27:09,014 INFO MainThread:3540 [wandb_init.py:init():484] communicating current version --2022-04-09 15:27:09,016 INFO WriterThread:3540 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 15:27:09,107 INFO MainThread:3540 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 15:27:09,608 INFO SenderThread:3540 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files --2022-04-09 15:27:09,608 INFO SenderThread:3540 [sender.py:_start_run_threads():707] run started: 15jgzcwp with start time 1649498229 --2022-04-09 15:27:09,610 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:09,610 INFO MainThread:3540 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/requirements.txt --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code/train_translation.py --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/diff.patch --2022-04-09 15:27:10,609 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/code --2022-04-09 15:27:11,004 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 15:27:11,005 INFO SenderThread:3540 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 15:27:11,006 INFO SenderThread:3540 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 15:27:11,013 INFO MainThread:3540 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 15:27:11,015 INFO MainThread:3540 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 15:27:11,017 INFO MainThread:3540 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 15:27:11,018 INFO MainThread:3540 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/conda-environment.yaml --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:11,608 INFO Thread-11 :3540 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-metadata.json --2022-04-09 15:27:11,957 INFO Thread-18 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/r7pplw70-diff.patch --2022-04-09 15:27:12,433 INFO Thread-15 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/2g6gfxwx-code/train_translation.py --2022-04-09 15:27:12,434 INFO Thread-14 :3540 [upload_job.py:push():133] Uploaded file /tmp/tmp9sk6_xjuwandb/1mjjo7ai-wandb-metadata.json --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:13,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/config.yaml --2022-04-09 15:27:15,610 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:17,611 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/output.log --2022-04-09 15:27:21,560 INFO SenderThread:3540 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 15:27:21,613 INFO Thread-11 :3540 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_152708-15jgzcwp/files/wandb-summary.json -diff --git a/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb b/wandb/run-20220409_152708-15jgzcwp/run-15jgzcwp.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py b/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -deleted file mode 100644 -index 596bd8d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml b/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch b/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -deleted file mode 100644 -index edba74d..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch -+++ /dev/null -@@ -1,457 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..6f7f3e6 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,180 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..596bd8d 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..7064436 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..3ee4416 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160115-yr1wk5mi/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..425ec98 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160115-yr1wk5mi --\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/output.log b/wandb/run-20220409_160115-yr1wk5mi/files/output.log -deleted file mode 100644 -index e872735..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/output.log -+++ /dev/null -@@ -1,6 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt b/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -deleted file mode 100644 -index 39bdbe7..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:31:16.739157", -- "startedAt": "2022-04-09T10:31:15.626079", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json b/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -deleted file mode 100644 -index 96a4906..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"iter_loss": 7.142178058624268, "_runtime": 14, "_timestamp": 1649500289, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -deleted file mode 100644 -index 2dc7db1..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log -+++ /dev/null -@@ -1,66 +0,0 @@ --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,660 DEBUG MainThread:6109 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 DEBUG SenderThread:6109 [sender.py:send():179] send: header --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,673 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:01:15,673 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:15,970 DEBUG SenderThread:6109 [sender.py:send():179] send: run --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:16,736 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():39] meta init --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:__init__():53] meta init done --2022-04-09 16:01:16,739 DEBUG HandlerThread:6109 [meta.py:probe():210] probe --2022-04-09 16:01:16,745 DEBUG HandlerThread:6109 [meta.py:_setup_git():200] setup git --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_setup_git():207] setup git done --2022-04-09 16:01:16,762 DEBUG HandlerThread:6109 [meta.py:_save_code():89] save code --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_code():110] save code done --2022-04-09 16:01:16,769 DEBUG HandlerThread:6109 [meta.py:_save_patches():127] save patches --2022-04-09 16:01:16,811 DEBUG HandlerThread:6109 [meta.py:_save_patches():169] save patches done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():57] save pip --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_pip():71] save pip done --2022-04-09 16:01:16,812 DEBUG HandlerThread:6109 [meta.py:_save_conda():78] save conda --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:_save_conda():86] save conda done --2022-04-09 16:01:18,148 DEBUG HandlerThread:6109 [meta.py:probe():252] probe done --2022-04-09 16:01:18,150 DEBUG SenderThread:6109 [sender.py:send():179] send: files --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,158 DEBUG HandlerThread:6109 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:01:18,158 DEBUG SenderThread:6109 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,709 DEBUG SenderThread:6109 [sender.py:send():179] send: config --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: history --2022-04-09 16:01:29,848 DEBUG SenderThread:6109 [sender.py:send():179] send: summary --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log b/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -deleted file mode 100644 -index 87f5666..0000000 ---- a/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log -+++ /dev/null -@@ -1,49 +0,0 @@ --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:01:15,631 INFO MainThread:6109 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug.log --2022-04-09 16:01:15,632 INFO MainThread:6109 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/logs/debug-internal.log --2022-04-09 16:01:15,633 INFO MainThread:6109 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:15,634 INFO MainThread:6109 [wandb_init.py:init():418] starting backend --2022-04-09 16:01:15,655 INFO MainThread:6109 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:01:15,656 INFO MainThread:6109 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:01:15,658 INFO wandb_internal:6109 [internal.py:wandb_internal():91] W&B internal server running at pid: 6109, started at: 2022-04-09 16:01:15.656065 --2022-04-09 16:01:15,659 INFO MainThread:6109 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:01:15,670 INFO MainThread:6109 [wandb_init.py:init():484] communicating current version --2022-04-09 16:01:15,672 INFO WriterThread:6109 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:01:15,966 INFO MainThread:6109 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:01:16,733 INFO SenderThread:6109 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files --2022-04-09 16:01:16,734 INFO SenderThread:6109 [sender.py:_start_run_threads():707] run started: yr1wk5mi with start time 1649500275 --2022-04-09 16:01:16,735 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:16,736 INFO MainThread:6109 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/requirements.txt --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/diff.patch --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code/train_translation.py --2022-04-09 16:01:17,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/code --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:01:18,150 INFO SenderThread:6109 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:01:18,151 INFO SenderThread:6109 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:01:18,160 INFO MainThread:6109 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:01:18,162 INFO MainThread:6109 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:01:18,163 INFO MainThread:6109 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:01:18,164 INFO MainThread:6109 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/conda-environment.yaml --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-metadata.json --2022-04-09 16:01:18,734 INFO Thread-11 :6109 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:19,843 INFO Thread-14 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/3aqderx8-wandb-metadata.json --2022-04-09 16:01:19,846 INFO Thread-15 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/4nx7fbcb-code/train_translation.py --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:20,735 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/config.yaml --2022-04-09 16:01:20,845 INFO Thread-18 :6109 [upload_job.py:push():133] Uploaded file /tmp/tmpguz2ugxewandb/35j9ij83-diff.patch --2022-04-09 16:01:22,918 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:24,920 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/output.log --2022-04-09 16:01:29,851 INFO SenderThread:6109 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:01:29,923 INFO Thread-11 :6109 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160115-yr1wk5mi/files/wandb-summary.json -diff --git a/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb b/wandb/run-20220409_160115-yr1wk5mi/run-yr1wk5mi.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py b/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -deleted file mode 100644 -index feaf1fc..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py -+++ /dev/null -@@ -1,377 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- wandb.log({"epoch_loss":epoch_loss}) -- if args.rank == 0: -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml b/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch b/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -deleted file mode 100644 -index eec0ab3..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch -+++ /dev/null -@@ -1,459 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..8b42533 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,182 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..feaf1fc 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,97 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ wandb.log({"epoch_loss":epoch_loss}) --+ if args.rank == 0: --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..e712296 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b2fc627 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160246-2bmbfqcy/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..337b531 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160246-2bmbfqcy --\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/output.log b/wandb/run-20220409_160246-2bmbfqcy/files/output.log -deleted file mode 100644 -index e15e9a4..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/output.log -+++ /dev/null -@@ -1,17 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt b/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -deleted file mode 100644 -index f4efc7b..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:32:47.190940", -- "startedAt": "2022-04-09T10:32:46.030719", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json b/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -deleted file mode 100644 -index 59ceedf..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 18, "_timestamp": 1649500384, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -deleted file mode 100644 -index 4dae842..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log -+++ /dev/null -@@ -1,68 +0,0 @@ --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,040 DEBUG MainThread:6410 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send():179] send: header --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,043 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:46,151 DEBUG SenderThread:6410 [sender.py:send():179] send: run --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:47,188 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():39] meta init --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:__init__():53] meta init done --2022-04-09 16:02:47,190 DEBUG HandlerThread:6410 [meta.py:probe():210] probe --2022-04-09 16:02:47,197 DEBUG HandlerThread:6410 [meta.py:_setup_git():200] setup git --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_setup_git():207] setup git done --2022-04-09 16:02:47,216 DEBUG HandlerThread:6410 [meta.py:_save_code():89] save code --2022-04-09 16:02:47,224 DEBUG HandlerThread:6410 [meta.py:_save_code():110] save code done --2022-04-09 16:02:47,225 DEBUG HandlerThread:6410 [meta.py:_save_patches():127] save patches --2022-04-09 16:02:47,270 DEBUG HandlerThread:6410 [meta.py:_save_patches():169] save patches done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():57] save pip --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_pip():71] save pip done --2022-04-09 16:02:47,271 DEBUG HandlerThread:6410 [meta.py:_save_conda():78] save conda --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:_save_conda():86] save conda done --2022-04-09 16:02:48,637 DEBUG HandlerThread:6410 [meta.py:probe():252] probe done --2022-04-09 16:02:48,639 DEBUG SenderThread:6410 [sender.py:send():179] send: files --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,649 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:49,267 DEBUG SenderThread:6410 [sender.py:send():179] send: config --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,268 DEBUG HandlerThread:6410 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:03:04,269 DEBUG SenderThread:6410 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:03:04,791 DEBUG SenderThread:6410 [sender.py:send():179] send: history --2022-04-09 16:03:04,792 DEBUG SenderThread:6410 [sender.py:send():179] send: summary --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log b/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -deleted file mode 100644 -index c4edd31..0000000 ---- a/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log -+++ /dev/null -@@ -1,48 +0,0 @@ --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug.log --2022-04-09 16:02:46,031 INFO MainThread:6410 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/logs/debug-internal.log --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:46,032 INFO MainThread:6410 [wandb_init.py:init():418] starting backend --2022-04-09 16:02:46,037 INFO MainThread:6410 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:02:46,038 INFO wandb_internal:6410 [internal.py:wandb_internal():91] W&B internal server running at pid: 6410, started at: 2022-04-09 16:02:46.037354 --2022-04-09 16:02:46,038 INFO MainThread:6410 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:02:46,039 INFO MainThread:6410 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:02:46,042 INFO MainThread:6410 [wandb_init.py:init():484] communicating current version --2022-04-09 16:02:46,043 INFO WriterThread:6410 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb --2022-04-09 16:02:46,147 INFO MainThread:6410 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:02:46,148 INFO MainThread:6410 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:02:47,185 INFO SenderThread:6410 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files --2022-04-09 16:02:47,185 INFO SenderThread:6410 [sender.py:_start_run_threads():707] run started: 2bmbfqcy with start time 1649500366 --2022-04-09 16:02:47,187 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:02:47,188 INFO MainThread:6410 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:02:48,186 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-summary.json --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/diff.patch --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/requirements.txt --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code/train_translation.py --2022-04-09 16:02:48,187 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/code --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:02:48,639 INFO SenderThread:6410 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:02:48,640 INFO SenderThread:6410 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:02:48,649 INFO MainThread:6410 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:02:48,651 INFO MainThread:6410 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:02:48,653 INFO MainThread:6410 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:02:49,195 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/conda-environment.yaml --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/wandb-metadata.json --2022-04-09 16:02:49,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:50,751 INFO Thread-16 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/8jmqqlw3-diff.patch --2022-04-09 16:02:50,752 INFO Thread-14 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/162ca126-wandb-metadata.json --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/config.yaml --2022-04-09 16:02:51,196 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:02:51,759 INFO Thread-15 :6410 [upload_job.py:push():133] Uploaded file /tmp/tmphv1ed_ldwandb/19onurwq-code/train_translation.py --2022-04-09 16:02:55,197 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:03,207 INFO Thread-11 :6410 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160246-2bmbfqcy/files/output.log --2022-04-09 16:03:04,798 INFO SenderThread:6410 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -diff --git a/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb b/wandb/run-20220409_160246-2bmbfqcy/run-2bmbfqcy.wandb -deleted file mode 100644 -index e69de29..0000000 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py b/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -deleted file mode 100644 -index 182fd97..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py -+++ /dev/null -@@ -1,378 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml b/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -deleted file mode 100644 -index a0e0750..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch b/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -deleted file mode 100644 -index 2c51f6a..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch -+++ /dev/null -@@ -1,470 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..507a499 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,192 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..182fd97 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,98 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..2224b92 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..94d02b9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160409-1qxpwcwj/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f7361e5 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160409-1qxpwcwj --\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/output.log b/wandb/run-20220409_160409-1qxpwcwj/files/output.log -deleted file mode 100644 -index 35bceac..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/output.log -+++ /dev/null -@@ -1,18 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt b/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -deleted file mode 100644 -index 440569b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:34:10.122598", -- "startedAt": "2022-04-09T10:34:09.149412", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json b/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -deleted file mode 100644 -index 52da06b..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 4649.924870014191, "_runtime": 27, "_timestamp": 1649500476, "_step": 0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -deleted file mode 100644 -index bf89eff..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log -+++ /dev/null -@@ -1,78 +0,0 @@ --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,159 DEBUG MainThread:6703 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send():179] send: header --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,163 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:09,250 DEBUG SenderThread:6703 [sender.py:send():179] send: run --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:10,119 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():39] meta init --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:__init__():53] meta init done --2022-04-09 16:04:10,122 DEBUG HandlerThread:6703 [meta.py:probe():210] probe --2022-04-09 16:04:10,130 DEBUG HandlerThread:6703 [meta.py:_setup_git():200] setup git --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_setup_git():207] setup git done --2022-04-09 16:04:10,195 DEBUG HandlerThread:6703 [meta.py:_save_code():89] save code --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_code():110] save code done --2022-04-09 16:04:10,211 DEBUG HandlerThread:6703 [meta.py:_save_patches():127] save patches --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_patches():169] save patches done --2022-04-09 16:04:10,306 DEBUG HandlerThread:6703 [meta.py:_save_pip():57] save pip --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_pip():71] save pip done --2022-04-09 16:04:10,307 DEBUG HandlerThread:6703 [meta.py:_save_conda():78] save conda --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:_save_conda():86] save conda done --2022-04-09 16:04:11,657 DEBUG HandlerThread:6703 [meta.py:probe():252] probe done --2022-04-09 16:04:11,658 DEBUG SenderThread:6703 [sender.py:send():179] send: files --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,667 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:11,669 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:12,396 DEBUG SenderThread:6703 [sender.py:send():179] send: config --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:27,397 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:27,397 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: history --2022-04-09 16:04:36,357 DEBUG SenderThread:6703 [sender.py:send():179] send: summary --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:39,168 DEBUG SenderThread:6703 [sender.py:send():179] send: stats --2022-04-09 16:04:44,241 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:44,241 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:59,736 DEBUG HandlerThread:6703 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:04:59,737 DEBUG SenderThread:6703 [sender.py:send_request():193] send_request: stop_status -diff --git a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log b/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -deleted file mode 100644 -index 0fbab81..0000000 ---- a/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log -+++ /dev/null -@@ -1,54 +0,0 @@ --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/logs/debug-internal.log --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:09,151 INFO MainThread:6703 [wandb_init.py:init():418] starting backend --2022-04-09 16:04:09,156 INFO MainThread:6703 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:04:09,157 INFO MainThread:6703 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:04:09,158 INFO wandb_internal:6703 [internal.py:wandb_internal():91] W&B internal server running at pid: 6703, started at: 2022-04-09 16:04:09.157143 --2022-04-09 16:04:09,159 INFO MainThread:6703 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:04:09,162 INFO MainThread:6703 [wandb_init.py:init():484] communicating current version --2022-04-09 16:04:09,163 INFO WriterThread:6703 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:04:09,248 INFO MainThread:6703 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:04:10,116 INFO SenderThread:6703 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files --2022-04-09 16:04:10,116 INFO SenderThread:6703 [sender.py:_start_run_threads():707] run started: 1qxpwcwj with start time 1649500449 --2022-04-09 16:04:10,118 INFO MainThread:6703 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:04:10,119 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/requirements.txt --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/diff.patch --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code/train_translation.py --2022-04-09 16:04:11,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/code --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:04:11,659 INFO SenderThread:6703 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:04:11,667 INFO MainThread:6703 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:04:11,670 INFO MainThread:6703 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:04:11,671 INFO MainThread:6703 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:04:11,672 INFO MainThread:6703 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/conda-environment.yaml --2022-04-09 16:04:12,117 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-metadata.json --2022-04-09 16:04:12,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:14,579 INFO Thread-18 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2jyc5la6-diff.patch --2022-04-09 16:04:15,118 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/config.yaml --2022-04-09 16:04:16,480 INFO Thread-14 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/a1u633fb-wandb-metadata.json --2022-04-09 16:04:16,597 INFO Thread-15 :6703 [upload_job.py:push():133] Uploaded file /tmp/tmp_q1pzmhpwandb/2s2yhxd4-code/train_translation.py --2022-04-09 16:04:18,121 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:26,125 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:28,126 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:34,128 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,129 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:36,357 INFO SenderThread:6703 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:04:37,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/wandb-summary.json --2022-04-09 16:04:38,334 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log --2022-04-09 16:04:50,337 INFO Thread-11 :6703 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160409-1qxpwcwj/files/output.log -diff --git a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb b/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb -deleted file mode 100644 -index 81c67b9..0000000 -Binary files a/wandb/run-20220409_160409-1qxpwcwj/run-1qxpwcwj.wandb and /dev/null differ -diff --git a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py b/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml b/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_160908-2097uoqw/files/config.yaml b/wandb/run-20220409_160908-2097uoqw/files/config.yaml -deleted file mode 100644 -index 1ebd7db..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_160908-2097uoqw/files/diff.patch b/wandb/run-20220409_160908-2097uoqw/files/diff.patch -deleted file mode 100644 -index 9c4e2ae..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/diff.patch -+++ /dev/null -@@ -1,482 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2d0dffc 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,202 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..18dd535 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..b8703a2 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_160908-2097uoqw/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7af087b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_160908-2097uoqw --\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/output.log b/wandb/run-20220409_160908-2097uoqw/files/output.log -deleted file mode 100644 -index ed7c7b5..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --load 0 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt b/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -deleted file mode 100644 -index 3cf53b0..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json -+++ /dev/null -@@ -1,27 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:39:09.049034", -- "startedAt": "2022-04-09T10:39:08.174640", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json b/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -deleted file mode 100644 -index 225791e..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5264.9873046875, "_runtime": 162, "_timestamp": 1649500910, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log b/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -deleted file mode 100644 -index 1baf812..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log -+++ /dev/null -@@ -1,1238 +0,0 @@ --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,183 DEBUG MainThread:7244 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 DEBUG SenderThread:7244 [sender.py:send():179] send: header --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,187 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:09:08,187 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:08,556 DEBUG SenderThread:7244 [sender.py:send():179] send: run --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:09,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():39] meta init --2022-04-09 16:09:09,048 DEBUG HandlerThread:7244 [meta.py:__init__():53] meta init done --2022-04-09 16:09:09,049 DEBUG HandlerThread:7244 [meta.py:probe():210] probe --2022-04-09 16:09:09,055 DEBUG HandlerThread:7244 [meta.py:_setup_git():200] setup git --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_setup_git():207] setup git done --2022-04-09 16:09:09,071 DEBUG HandlerThread:7244 [meta.py:_save_code():89] save code --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_code():110] save code done --2022-04-09 16:09:09,078 DEBUG HandlerThread:7244 [meta.py:_save_patches():127] save patches --2022-04-09 16:09:09,148 DEBUG HandlerThread:7244 [meta.py:_save_patches():169] save patches done --2022-04-09 16:09:09,149 DEBUG HandlerThread:7244 [meta.py:_save_pip():57] save pip --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_pip():71] save pip done --2022-04-09 16:09:09,150 DEBUG HandlerThread:7244 [meta.py:_save_conda():78] save conda --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:_save_conda():86] save conda done --2022-04-09 16:09:10,558 DEBUG HandlerThread:7244 [meta.py:probe():252] probe done --2022-04-09 16:09:10,559 DEBUG SenderThread:7244 [sender.py:send():179] send: files --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,033 DEBUG SenderThread:7244 [sender.py:send():179] send: config --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:09:24,796 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:26,037 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:26,037 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:37,780 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:41,491 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:41,492 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:09:56,929 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:09:56,929 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:07,915 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:08,466 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:10:12,367 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:12,368 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:10:15,825 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:27,818 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:27,818 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:43,478 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:43,478 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:10:58,974 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:10:58,974 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,373 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:05,374 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:08,654 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:14,750 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:14,750 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:28,251 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:32,169 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:32,169 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:39,457 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:48,462 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:11:48,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: history --2022-04-09 16:11:50,289 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:03,967 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:12:03,968 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw --2022-04-09 16:12:05,938 INFO MainThread:7244 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 16:12:05,939 INFO MainThread:7244 [wandb_run.py:_restore():1480] restore --2022-04-09 16:12:06,150 DEBUG SenderThread:7244 [sender.py:send():179] send: telemetry --2022-04-09 16:12:06,151 DEBUG SenderThread:7244 [sender.py:send():179] send: exit --2022-04-09 16:12:06,151 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 16:12:06,152 INFO SenderThread:7244 [sender.py:send_exit():295] send defer --2022-04-09 16:12:06,153 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:06,155 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,155 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 16:12:06,155 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:06,156 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 16:12:06,157 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 16:12:06,158 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,158 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 16:12:06,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 16:12:06,226 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send():179] send: stats --2022-04-09 16:12:06,227 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,227 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 16:12:06,227 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 16:12:06,227 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 16:12:06,228 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,228 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send():179] send: summary --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:12:06,228 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 16:12:06,228 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 16:12:06,229 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:06,229 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 16:12:06,229 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:06,229 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 16:12:06,259 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:06,450 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:06,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:07,230 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 16:12:07,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,231 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,231 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 16:12:07,231 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 40095 --} -- --2022-04-09 16:12:07,232 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,232 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 16:12:07,232 INFO SenderThread:7244 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:12:07,333 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:07,451 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:07,453 INFO SenderThread:7244 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt requirements.txt --2022-04-09 16:12:07,454 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:12:07,455 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log output.log --2022-04-09 16:12:07,456 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:12:07,457 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json wandb-summary.json --2022-04-09 16:12:07,467 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml config.yaml --2022-04-09 16:12:07,468 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch diff.patch --2022-04-09 16:12:07,507 INFO SenderThread:7244 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py code/train_translation.py --2022-04-09 16:12:07,507 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 16:12:07,508 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:07,510 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,510 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 16:12:07,510 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 40095 -- total_bytes: 50723 --} -- --2022-04-09 16:12:07,511 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 16:12:07,511 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:07,511 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 16:12:07,512 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:07,512 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 16:12:07,512 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:07,513 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 16:12:07,612 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,484 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 16:12:08,485 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,486 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,486 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 16:12:08,487 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 16:12:08,487 INFO SenderThread:7244 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 16:12:08,487 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41552 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,489 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: defer --2022-04-09 16:12:08,489 DEBUG SenderThread:7244 [sender.py:send():179] send: final --2022-04-09 16:12:08,490 INFO HandlerThread:7244 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send():179] send: footer --2022-04-09 16:12:08,490 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: defer --2022-04-09 16:12:08,490 INFO SenderThread:7244 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 16:12:08,591 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,591 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,593 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,695 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,695 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,696 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,798 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,798 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,799 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:08,848 INFO Thread-33 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:12:08,900 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:08,901 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:08,902 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 41657 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,004 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,005 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,006 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,108 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,109 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,110 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,212 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,213 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,214 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,316 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,317 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,318 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,420 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,421 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,422 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,524 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,525 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,526 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,628 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,629 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,630 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,732 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,733 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,734 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 42867 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,837 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,838 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,840 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:09,875 INFO Thread-32 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:12:09,942 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:09,942 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:09,944 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,046 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,046 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,047 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,149 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,150 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,151 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,253 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,254 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,255 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,304 INFO Thread-29 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:12:10,357 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,358 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,359 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,461 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,462 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,463 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,772 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,772 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,772 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,874 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,874 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,876 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:10,978 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:10,979 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:10,980 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,082 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,082 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,084 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,186 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,186 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,188 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,290 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,290 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,292 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,314 INFO Thread-30 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:11,394 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,394 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,396 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,498 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,499 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,500 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,602 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,603 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,604 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,706 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,707 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,708 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,810 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,810 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,812 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:11,914 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:11,915 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:11,916 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,018 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,019 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,020 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,122 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,122 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,124 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,226 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,226 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,228 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,330 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,330 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,332 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,434 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,435 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,436 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,538 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,538 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,540 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,642 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,642 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,644 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,746 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,746 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,747 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,850 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,850 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,852 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:12,954 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:12,954 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:12,955 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,057 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,058 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,059 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,161 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,162 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,163 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,265 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,266 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,267 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,369 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,370 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,371 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,473 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,473 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,475 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,577 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,577 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,578 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,680 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,681 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,682 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,784 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,785 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,786 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,888 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,889 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,890 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:13,992 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:13,993 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:13,994 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,096 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,097 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,098 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,200 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,201 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,202 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,304 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,305 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,307 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,409 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,410 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,411 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,513 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,514 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,515 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,617 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,618 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,619 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,721 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,721 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,723 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,826 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,827 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,829 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:14,931 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:14,931 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:14,933 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,034 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,035 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,037 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,138 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,139 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,141 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,244 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,244 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,245 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,348 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,348 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,350 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,453 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,454 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,461 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,565 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,566 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,567 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,669 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,669 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,671 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,773 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,773 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,775 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,877 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,877 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,879 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:15,981 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:15,982 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:15,983 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 47441 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,085 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,086 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,087 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,189 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,190 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,191 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,293 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,294 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,295 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,397 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,398 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,399 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,501 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,502 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,503 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,605 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,606 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,607 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,709 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,710 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,711 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,813 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,814 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,816 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:16,918 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:16,919 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:16,920 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,022 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,023 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,024 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,126 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,127 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,128 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,230 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,230 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,232 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,334 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,335 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,336 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,374 INFO Thread-31 :7244 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:12:17,438 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,438 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,440 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,542 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,543 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,544 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,646 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 16:12:17,647 DEBUG SenderThread:7244 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 16:12:17,647 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:17,648 INFO MainThread:7244 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 50723 -- total_bytes: 50723 --} -- --2022-04-09 16:12:17,650 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 16:12:17,653 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 16:12:17,656 DEBUG HandlerThread:7244 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 16:12:17,656 INFO HandlerThread:7244 [handler.py:finish():638] shutting down handler --2022-04-09 16:12:18,493 INFO WriterThread:7244 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:12:18,647 INFO SenderThread:7244 [sender.py:finish():933] shutting down sender --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:12:18,648 INFO SenderThread:7244 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:12:18,661 INFO MainThread:7244 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 16:12:18,662 INFO MainThread:7244 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 16:12:18,663 INFO MainThread:7244 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 16:12:18,709 INFO MainThread:7244 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_160908-2097uoqw/logs/debug.log b/wandb/run-20220409_160908-2097uoqw/logs/debug.log -deleted file mode 100644 -index ad8f755..0000000 ---- a/wandb/run-20220409_160908-2097uoqw/logs/debug.log -+++ /dev/null -@@ -1,77 +0,0 @@ --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug.log --2022-04-09 16:09:08,175 INFO MainThread:7244 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/logs/debug-internal.log --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:08,176 INFO MainThread:7244 [wandb_init.py:init():418] starting backend --2022-04-09 16:09:08,180 INFO MainThread:7244 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:09:08,181 INFO wandb_internal:7244 [internal.py:wandb_internal():91] W&B internal server running at pid: 7244, started at: 2022-04-09 16:09:08.181261 --2022-04-09 16:09:08,182 INFO MainThread:7244 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:09:08,183 INFO MainThread:7244 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:09:08,186 INFO MainThread:7244 [wandb_init.py:init():484] communicating current version --2022-04-09 16:09:08,186 INFO WriterThread:7244 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:09:08,555 INFO MainThread:7244 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:09:09,044 INFO SenderThread:7244 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files --2022-04-09 16:09:09,044 INFO SenderThread:7244 [sender.py:_start_run_threads():707] run started: 2097uoqw with start time 1649500748 --2022-04-09 16:09:09,045 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:09,046 INFO MainThread:7244 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/diff.patch --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:10,046 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/requirements.txt --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code/train_translation.py --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:10,047 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/code --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:09:10,560 INFO SenderThread:7244 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:09:10,561 INFO SenderThread:7244 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:09:10,566 INFO MainThread:7244 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:09:10,570 INFO MainThread:7244 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:09:10,574 INFO MainThread:7244 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:09:11,076 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/conda-environment.yaml --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-metadata.json --2022-04-09 16:09:11,080 INFO Thread-11 :7244 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:12,541 INFO Thread-14 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/59p33rsf-wandb-metadata.json --2022-04-09 16:09:12,542 INFO Thread-22 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/1s3licml-diff.patch --2022-04-09 16:09:12,543 INFO Thread-17 :7244 [upload_job.py:push():133] Uploaded file /tmp/tmpaa9c8yvlwandb/g430jhga-code/train_translation.py --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/config.yaml --2022-04-09 16:09:13,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:15,070 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:17,071 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:23,074 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:24,796 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:09:25,075 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:09:39,079 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:07,924 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:08,089 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:13,091 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:15,825 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:10:16,092 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:10:17,093 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:10:29,096 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:03,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,105 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:05,374 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:06,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:07,393 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:21,397 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:27,410 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:28,296 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:28,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:29,411 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:43,415 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:47,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:49,437 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:11:50,291 INFO SenderThread:7244 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:11:50,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/wandb-summary.json --2022-04-09 16:11:51,438 INFO Thread-11 :7244 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_160908-2097uoqw/files/output.log --2022-04-09 16:12:05,937 INFO MainThread:7244 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/2097uoqw -diff --git a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb b/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb -deleted file mode 100644 -index b5995f1..0000000 -Binary files a/wandb/run-20220409_160908-2097uoqw/run-2097uoqw.wandb and /dev/null differ -diff --git a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py b/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -deleted file mode 100644 -index 529add4..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py -+++ /dev/null -@@ -1,380 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- if args.rank == 0: -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -diff --git a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml b/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_161421-3t82t88x/files/config.yaml b/wandb/run-20220409_161421-3t82t88x/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_161421-3t82t88x/files/diff.patch b/wandb/run-20220409_161421-3t82t88x/files/diff.patch -deleted file mode 100644 -index aa6c773..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/diff.patch -+++ /dev/null -@@ -1,528 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..2aaecf9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,248 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..529add4 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,100 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ if args.rank == 0: --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..91bb884 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..252e468 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_161421-3t82t88x/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..c99b343 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_161421-3t82t88x --\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/output.log b/wandb/run-20220409_161421-3t82t88x/files/output.log -deleted file mode 100644 -index 3bf650b..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/output.log -+++ /dev/null -@@ -1,67 +0,0 @@ -- --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Exception in thread Thread-15: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") --Exception: The wandb backend process has shutdown --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt b/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -deleted file mode 100644 -index f9df6f1..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:44:23.094487", -- "startedAt": "2022-04-09T10:44:21.821617", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json b/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log b/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -deleted file mode 100644 -index 3f70132..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,831 DEBUG MainThread:8815 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send():179] send: header --2022-04-09 16:14:21,835 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:21,939 DEBUG SenderThread:8815 [sender.py:send():179] send: run --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,090 DEBUG SenderThread:8815 [sender.py:send():179] send: summary --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:23,092 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():39] meta init --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:__init__():53] meta init done --2022-04-09 16:14:23,094 DEBUG HandlerThread:8815 [meta.py:probe():210] probe --2022-04-09 16:14:23,100 DEBUG HandlerThread:8815 [meta.py:_setup_git():200] setup git --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_setup_git():207] setup git done --2022-04-09 16:14:23,122 DEBUG HandlerThread:8815 [meta.py:_save_code():89] save code --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_code():110] save code done --2022-04-09 16:14:23,133 DEBUG HandlerThread:8815 [meta.py:_save_patches():127] save patches --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_patches():169] save patches done --2022-04-09 16:14:23,196 DEBUG HandlerThread:8815 [meta.py:_save_pip():57] save pip --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_pip():71] save pip done --2022-04-09 16:14:23,197 DEBUG HandlerThread:8815 [meta.py:_save_conda():78] save conda --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,537 DEBUG HandlerThread:8815 [meta.py:_save_conda():86] save conda done --2022-04-09 16:14:24,538 DEBUG HandlerThread:8815 [meta.py:probe():252] probe done --2022-04-09 16:14:24,539 DEBUG SenderThread:8815 [sender.py:send():179] send: files --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,548 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:24,548 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:25,577 DEBUG SenderThread:8815 [sender.py:send():179] send: config --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:40,579 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:40,579 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:14:51,743 DEBUG SenderThread:8815 [sender.py:send():179] send: stats --2022-04-09 16:14:56,424 DEBUG HandlerThread:8815 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:14:56,424 DEBUG SenderThread:8815 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:15:01,820 DEBUG SenderThread:8815 [sender.py:send():179] send: history --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/logs/debug.log b/wandb/run-20220409_161421-3t82t88x/logs/debug.log -deleted file mode 100644 -index 99b6b97..0000000 ---- a/wandb/run-20220409_161421-3t82t88x/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug.log --2022-04-09 16:14:21,822 INFO MainThread:8815 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/logs/debug-internal.log --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:21,823 INFO MainThread:8815 [wandb_init.py:init():418] starting backend --2022-04-09 16:14:21,828 INFO MainThread:8815 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:14:21,829 INFO wandb_internal:8815 [internal.py:wandb_internal():91] W&B internal server running at pid: 8815, started at: 2022-04-09 16:14:21.828726 --2022-04-09 16:14:21,829 INFO MainThread:8815 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:14:21,830 INFO MainThread:8815 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:14:21,834 INFO MainThread:8815 [wandb_init.py:init():484] communicating current version --2022-04-09 16:14:21,835 INFO WriterThread:8815 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:14:21,935 INFO MainThread:8815 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:14:21,936 INFO MainThread:8815 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:14:23,089 INFO SenderThread:8815 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:14:23,089 INFO SenderThread:8815 [sender.py:_start_run_threads():707] run started: 3t82t88x with start time 1649501061 --2022-04-09 16:14:23,091 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:14:23,091 INFO MainThread:8815 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py --2022-04-09 16:14:24,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:14:24,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code --2022-04-09 16:14:24,539 INFO SenderThread:8815 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:14:24,540 INFO SenderThread:8815 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:14:24,541 INFO SenderThread:8815 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:14:24,547 INFO MainThread:8815 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:14:24,551 INFO MainThread:8815 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:14:24,552 INFO MainThread:8815 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:14:24,553 INFO MainThread:8815 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:14:25,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json --2022-04-09 16:14:25,093 INFO Thread-11 :8815 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:26,654 INFO Thread-14 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1woflnrf-wandb-metadata.json --2022-04-09 16:14:26,655 INFO Thread-17 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/2g34m9v2-code/train_translation.py --2022-04-09 16:14:27,090 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:27,669 INFO Thread-18 :8815 [upload_job.py:push():133] Uploaded file /tmp/tmpyo0egpl2wandb/1gwzitp2-diff.patch --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:14:29,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:14:31,091 INFO Thread-11 :8815 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:01,820 INFO WriterThread:8815 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb --2022-04-09 16:15:01,820 INFO SenderThread:8815 [sender.py:finish():933] shutting down sender --2022-04-09 16:15:01,821 INFO SenderThread:8815 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:15:02,097 INFO SenderThread:8815 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files --2022-04-09 16:15:02,098 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt requirements.txt --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:15:02,099 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log output.log --2022-04-09 16:15:02,120 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:15:02,121 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json wandb-summary.json --2022-04-09 16:15:02,142 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml config.yaml --2022-04-09 16:15:02,153 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/diff.patch diff.patch --2022-04-09 16:15:02,165 INFO SenderThread:8815 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/code/train_translation.py code/train_translation.py --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:15:02,166 INFO SenderThread:8815 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:15:04,027 INFO Thread-25 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/output.log --2022-04-09 16:15:04,029 INFO Thread-27 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/wandb-summary.json --2022-04-09 16:15:04,030 INFO Thread-24 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/requirements.txt --2022-04-09 16:15:04,034 INFO Thread-26 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/conda-environment.yaml --2022-04-09 16:15:04,036 INFO Thread-28 :8815 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_161421-3t82t88x/files/config.yaml --2022-04-09 16:15:05,015 ERROR wandb_internal:8815 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 16:24:49,089 INFO MainThread:8815 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 16:24:49,090 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,379 INFO MainThread:8815 [wandb_run.py:_restore():1480] restore --2022-04-09 16:24:49,381 INFO MainThread:8815 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb b/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb -deleted file mode 100644 -index a4486ce..0000000 -Binary files a/wandb/run-20220409_161421-3t82t88x/run-3t82t88x.wandb and /dev/null differ -diff --git a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py b/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml b/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_162621-m83puhmm/files/config.yaml b/wandb/run-20220409_162621-m83puhmm/files/config.yaml -deleted file mode 100644 -index f0ae705..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 1 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_162621-m83puhmm/files/diff.patch b/wandb/run-20220409_162621-m83puhmm/files/diff.patch -deleted file mode 100644 -index 9eddab1..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/diff.patch -+++ /dev/null -@@ -1,560 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..353da1f 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,249 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..f0332eb 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..97853e9 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_162621-m83puhmm/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..7be71e2 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_162621-m83puhmm --\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/output.log b/wandb/run-20220409_162621-m83puhmm/files/output.log -deleted file mode 100644 -index ee1c9e3..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/output.log -+++ /dev/null -@@ -1,52 +0,0 @@ -- --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --train_translation.py --load 0 --test_translation 1 --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --test_bleu_score 0.0 --Exception in thread Thread-6: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --wandb: ERROR Internal wandb error: file data was not synced --Traceback (most recent call last): -- File "", line 1, in -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main -- exitcode = _main(fd) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/spawn.py", line 118, in _main -- return self._bootstrap() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/process.py", line 315, in _bootstrap -- threading._shutdown() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 1307, in _shutdown -- lock.acquire() --KeyboardInterrupt -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt b/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -deleted file mode 100644 -index 4ce8f76..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json -+++ /dev/null -@@ -1,29 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T10:56:22.902051", -- "startedAt": "2022-04-09T10:56:21.924771", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--load", -- "0", -- "--test_translation", -- "1" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json b/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -deleted file mode 100644 -index 9e26dfe..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{} -\ No newline at end of file -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log b/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -deleted file mode 100644 -index 7032449..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log -+++ /dev/null -@@ -1,107 +0,0 @@ --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,934 DEBUG MainThread:9280 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 16:26:21,937 DEBUG SenderThread:9280 [sender.py:send():179] send: header --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:21,938 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: check_version --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,344 DEBUG SenderThread:9280 [sender.py:send():179] send: run --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,889 DEBUG SenderThread:9280 [sender.py:send():179] send: summary --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:22,895 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():39] meta init --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:__init__():53] meta init done --2022-04-09 16:26:22,901 DEBUG HandlerThread:9280 [meta.py:probe():210] probe --2022-04-09 16:26:22,908 DEBUG HandlerThread:9280 [meta.py:_setup_git():200] setup git --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_setup_git():207] setup git done --2022-04-09 16:26:22,953 DEBUG HandlerThread:9280 [meta.py:_save_code():89] save code --2022-04-09 16:26:22,972 DEBUG HandlerThread:9280 [meta.py:_save_code():110] save code done --2022-04-09 16:26:22,973 DEBUG HandlerThread:9280 [meta.py:_save_patches():127] save patches --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_patches():169] save patches done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():57] save pip --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_pip():71] save pip done --2022-04-09 16:26:23,081 DEBUG HandlerThread:9280 [meta.py:_save_conda():78] save conda --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:_save_conda():86] save conda done --2022-04-09 16:26:24,438 DEBUG HandlerThread:9280 [meta.py:probe():252] probe done --2022-04-09 16:26:24,440 DEBUG SenderThread:9280 [sender.py:send():179] send: files --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:24,448 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:24,898 DEBUG SenderThread:9280 [sender.py:send():179] send: config --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:39,905 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:39,905 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:26:51,624 DEBUG SenderThread:9280 [sender.py:send():179] send: stats --2022-04-09 16:26:55,340 DEBUG HandlerThread:9280 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 16:26:55,340 DEBUG SenderThread:9280 [sender.py:send_request():193] send_request: stop_status --2022-04-09 16:27:06,912 DEBUG SenderThread:9280 [sender.py:send():179] send: history --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/logs/debug.log b/wandb/run-20220409_162621-m83puhmm/logs/debug.log -deleted file mode 100644 -index 5053427..0000000 ---- a/wandb/run-20220409_162621-m83puhmm/logs/debug.log -+++ /dev/null -@@ -1,85 +0,0 @@ --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 16:26:21,925 INFO MainThread:9280 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/logs/debug-internal.log --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():369] calling init triggers --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:21,926 INFO MainThread:9280 [wandb_init.py:init():418] starting backend --2022-04-09 16:26:21,931 INFO MainThread:9280 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 16:26:21,932 INFO wandb_internal:9280 [internal.py:wandb_internal():91] W&B internal server running at pid: 9280, started at: 2022-04-09 16:26:21.931687 --2022-04-09 16:26:21,932 INFO MainThread:9280 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 16:26:21,934 INFO MainThread:9280 [wandb_init.py:init():423] backend started and connected --2022-04-09 16:26:21,936 INFO MainThread:9280 [wandb_init.py:init():465] updated telemetry --2022-04-09 16:26:21,937 INFO MainThread:9280 [wandb_init.py:init():484] communicating current version --2022-04-09 16:26:21,937 INFO WriterThread:9280 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:26:22,343 INFO MainThread:9280 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 16:26:22,344 INFO MainThread:9280 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 16:26:22,884 INFO SenderThread:9280 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:26:22,885 INFO SenderThread:9280 [sender.py:_start_run_threads():707] run started: m83puhmm with start time 1649501781 --2022-04-09 16:26:22,890 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 16:26:22,893 INFO MainThread:9280 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:26:23,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py --2022-04-09 16:26:23,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code --2022-04-09 16:26:24,440 INFO SenderThread:9280 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 16:26:24,441 INFO SenderThread:9280 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 16:26:24,442 INFO SenderThread:9280 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 16:26:24,448 INFO MainThread:9280 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 16:26:24,450 INFO MainThread:9280 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 16:26:24,451 INFO MainThread:9280 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 16:26:24,454 INFO MainThread:9280 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 1, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 16:26:24,885 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json --2022-04-09 16:26:24,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:25,823 INFO Thread-17 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/xb2dntmc-code/train_translation.py --2022-04-09 16:26:25,824 INFO Thread-14 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/a41a1xzf-wandb-metadata.json --2022-04-09 16:26:26,830 INFO Thread-22 :9280 [upload_job.py:push():133] Uploaded file /tmp/tmpnmpgmtujwandb/3ttad6f8-diff.patch --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:26:26,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:28,886 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:26:30,887 INFO Thread-11 :9280 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:06,912 INFO SenderThread:9280 [sender.py:finish():933] shutting down sender --2022-04-09 16:27:06,913 INFO SenderThread:9280 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files --2022-04-09 16:27:07,894 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt requirements.txt --2022-04-09 16:27:07,895 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-metadata.json wandb-metadata.json --2022-04-09 16:27:07,896 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log output.log --2022-04-09 16:27:07,903 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml conda-environment.yaml --2022-04-09 16:27:07,904 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json wandb-summary.json --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml config.yaml --2022-04-09 16:27:07,905 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/diff.patch diff.patch --2022-04-09 16:27:07,908 INFO SenderThread:9280 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/code/train_translation.py code/train_translation.py --2022-04-09 16:27:07,909 INFO SenderThread:9280 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 16:27:07,910 INFO SenderThread:9280 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 16:27:07,912 INFO WriterThread:9280 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb --2022-04-09 16:27:09,044 INFO Thread-25 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/output.log --2022-04-09 16:27:09,053 INFO Thread-26 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/conda-environment.yaml --2022-04-09 16:27:09,056 INFO Thread-24 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/requirements.txt --2022-04-09 16:27:09,061 INFO Thread-27 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/wandb-summary.json --2022-04-09 16:27:09,079 INFO Thread-28 :9280 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_162621-m83puhmm/files/config.yaml --2022-04-09 16:27:09,727 ERROR wandb_internal:9280 [internal.py:wandb_internal():159] Thread HandlerThread: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 55, in run -- self._run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/internal/internal_util.py", line 102, in _run -- record = self._input_record_q.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 --2022-04-09 17:37:10,785 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,969 INFO MainThread:9280 [wandb_run.py:_restore():1480] restore --2022-04-09 17:37:10,971 INFO MainThread:9280 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb b/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb -deleted file mode 100644 -index 978cbe5..0000000 -Binary files a/wandb/run-20220409_162621-m83puhmm/run-m83puhmm.wandb and /dev/null differ -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py b/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml b/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -deleted file mode 100644 -index 1988ff1..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 4 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 256 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 5 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 1 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 3 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 1 -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch b/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -deleted file mode 100644 -index d503875..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch -+++ /dev/null -@@ -1,561 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..b0966e9 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,250 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..1486dd6 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..071678f 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_173901-1dj6b5jf/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..be8b91a 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_173901-1dj6b5jf --\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/output.log b/wandb/run-20220409_173901-1dj6b5jf/files/output.log -deleted file mode 100644 -index f4f17d5..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/output.log -+++ /dev/null -@@ -1,59 +0,0 @@ -- --train_translation.py --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --translation model saved in checkpoint --{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --translation model saved in checkpoint --{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --translation model saved in checkpoint --{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt b/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -deleted file mode 100644 -index 6c00633..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json -+++ /dev/null -@@ -1,24 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:09:01.944494", -- "startedAt": "2022-04-09T12:09:01.199712", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json b/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -deleted file mode 100644 -index c0804b4..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 5045.823547363281, "_runtime": 154, "_timestamp": 1649506295, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -deleted file mode 100644 -index 67f5897..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log -+++ /dev/null -@@ -1,418 +0,0 @@ --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,208 DEBUG MainThread:10760 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send():179] send: header --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,212 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,337 DEBUG SenderThread:10760 [sender.py:send():179] send: run --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:01,942 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():39] meta init --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:__init__():53] meta init done --2022-04-09 17:39:01,944 DEBUG HandlerThread:10760 [meta.py:probe():210] probe --2022-04-09 17:39:01,950 DEBUG HandlerThread:10760 [meta.py:_setup_git():200] setup git --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_setup_git():207] setup git done --2022-04-09 17:39:01,967 DEBUG HandlerThread:10760 [meta.py:_save_code():89] save code --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_code():110] save code done --2022-04-09 17:39:01,975 DEBUG HandlerThread:10760 [meta.py:_save_patches():127] save patches --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_patches():169] save patches done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():57] save pip --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_pip():71] save pip done --2022-04-09 17:39:02,020 DEBUG HandlerThread:10760 [meta.py:_save_conda():78] save conda --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:_save_conda():86] save conda done --2022-04-09 17:39:03,360 DEBUG HandlerThread:10760 [meta.py:probe():252] probe done --2022-04-09 17:39:03,362 DEBUG SenderThread:10760 [sender.py:send():179] send: files --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,372 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:03,372 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,822 DEBUG SenderThread:10760 [sender.py:send():179] send: config --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:39:16,267 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:18,825 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:18,826 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:30,755 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:39:34,298 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:34,298 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:39:49,766 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:39:49,766 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:01,384 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:05,203 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:05,204 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:20,708 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,724 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:20,725 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,136 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:27,137 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:32,273 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:40:36,248 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:36,249 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:40:47,641 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:51,681 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:40:51,682 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:02,941 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,142 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:07,142 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:07,869 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:22,870 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:22,871 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:33,728 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: history --2022-04-09 17:41:35,959 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,321 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:41:38,322 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:_restore():1480] restore --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: telemetry --2022-04-09 17:41:51,002 DEBUG SenderThread:10760 [sender.py:send():179] send: exit --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 17:41:51,003 INFO SenderThread:10760 [sender.py:send_exit():295] send defer --2022-04-09 17:41:51,004 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,005 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,006 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,006 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 17:41:51,007 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,008 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,008 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 17:41:51,009 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 17:41:51,009 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,010 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 17:41:51,062 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,062 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send():179] send: stats --2022-04-09 17:41:51,063 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,063 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 17:41:51,063 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,063 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 17:41:51,064 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,064 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 17:41:51,064 DEBUG SenderThread:10760 [sender.py:send():179] send: summary --2022-04-09 17:41:51,064 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 17:41:51,065 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,065 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 17:41:51,065 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,065 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 17:41:51,109 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:51,203 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:51,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:51,546 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 17:41:51,546 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:51,546 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:51,546 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 44166 --} -- --2022-04-09 17:41:51,546 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 17:41:51,547 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:51,547 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 17:41:51,547 INFO SenderThread:10760 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 17:41:51,648 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,204 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:41:52,206 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt requirements.txt --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json wandb-metadata.json --2022-04-09 17:41:52,207 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log output.log --2022-04-09 17:41:52,208 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml conda-environment.yaml --2022-04-09 17:41:52,209 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json wandb-summary.json --2022-04-09 17:41:52,218 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml config.yaml --2022-04-09 17:41:52,220 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch diff.patch --2022-04-09 17:41:52,222 INFO SenderThread:10760 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py code/train_translation.py --2022-04-09 17:41:52,224 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 17:41:52,224 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 17:41:52,225 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 17:41:52,225 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:52,225 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 17:41:52,225 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,225 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 17:41:52,225 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,226 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,226 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 17:41:52,328 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,842 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 17:41:52,842 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,844 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,844 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 17:41:52,845 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:52,846 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 17:41:52,846 INFO SenderThread:10760 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 17:41:52,848 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: defer --2022-04-09 17:41:52,848 DEBUG SenderThread:10760 [sender.py:send():179] send: final --2022-04-09 17:41:52,849 INFO HandlerThread:10760 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 17:41:52,849 DEBUG SenderThread:10760 [sender.py:send():179] send: footer --2022-04-09 17:41:52,850 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: defer --2022-04-09 17:41:52,850 INFO SenderThread:10760 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 17:41:52,947 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:52,947 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:52,948 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 44166 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,049 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,050 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,051 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 45730 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,153 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,153 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,155 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,256 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,257 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,258 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,360 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,361 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,362 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,464 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,465 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,466 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,502 INFO Thread-33 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:41:53,504 INFO Thread-29 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:41:53,512 INFO Thread-32 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:53,524 INFO Thread-31 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:41:53,568 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,568 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,569 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,671 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,672 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,673 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,775 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,776 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,777 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,879 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,879 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,881 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:53,983 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:53,983 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:53,984 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,033 INFO Thread-30 :10760 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:54,086 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,087 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,088 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,190 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,190 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,192 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,294 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 17:41:54,294 DEBUG SenderThread:10760 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 17:41:54,294 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:54,295 INFO MainThread:10760 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 54768 -- total_bytes: 54768 --} -- --2022-04-09 17:41:54,297 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 17:41:54,299 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 17:41:54,302 DEBUG HandlerThread:10760 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 17:41:54,302 INFO HandlerThread:10760 [handler.py:finish():638] shutting down handler --2022-04-09 17:41:54,849 INFO WriterThread:10760 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:41:55,295 INFO SenderThread:10760 [sender.py:finish():933] shutting down sender --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 17:41:55,295 INFO SenderThread:10760 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 17:41:55,308 INFO MainThread:10760 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 17:41:55,309 INFO MainThread:10760 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 17:41:55,310 INFO MainThread:10760 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 17:41:55,323 INFO MainThread:10760 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log b/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -deleted file mode 100644 -index 2ea4289..0000000 ---- a/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log -+++ /dev/null -@@ -1,73 +0,0 @@ --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug.log --2022-04-09 17:39:01,200 INFO MainThread:10760 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/logs/debug-internal.log --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():376] wandb.init called with sweep_config: {} --config: {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:01,201 INFO MainThread:10760 [wandb_init.py:init():418] starting backend --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:39:01,206 INFO MainThread:10760 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:39:01,207 INFO wandb_internal:10760 [internal.py:wandb_internal():91] W&B internal server running at pid: 10760, started at: 2022-04-09 17:39:01.206592 --2022-04-09 17:39:01,208 INFO MainThread:10760 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:39:01,211 INFO MainThread:10760 [wandb_init.py:init():484] communicating current version --2022-04-09 17:39:01,212 INFO WriterThread:10760 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:39:01,333 INFO MainThread:10760 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:39:01,939 INFO SenderThread:10760 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files --2022-04-09 17:39:01,939 INFO SenderThread:10760 [sender.py:_start_run_threads():707] run started: 1dj6b5jf with start time 1649506141 --2022-04-09 17:39:01,941 INFO MainThread:10760 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:39:01,941 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/requirements.txt --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code/train_translation.py --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/diff.patch --2022-04-09 17:39:02,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/code --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:39:03,362 INFO SenderThread:10760 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:39:03,363 INFO SenderThread:10760 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:39:03,372 INFO MainThread:10760 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:39:03,374 INFO MainThread:10760 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:39:03,375 INFO MainThread:10760 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:39:03,376 INFO MainThread:10760 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 5, 'batch_size': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 256, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 1, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 1} --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/conda-environment.yaml --2022-04-09 17:39:03,940 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-metadata.json --2022-04-09 17:39:03,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:04,556 INFO Thread-14 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/2bsevvzq-wandb-metadata.json --2022-04-09 17:39:04,570 INFO Thread-15 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/229pqnc8-code/train_translation.py --2022-04-09 17:39:05,340 INFO Thread-17 :10760 [upload_job.py:push():133] Uploaded file /tmp/tmpfwfmk75uwandb/1kcug5yp-diff.patch --2022-04-09 17:39:05,941 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/config.yaml --2022-04-09 17:39:05,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:07,942 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:09,943 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:15,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:16,268 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:39:16,945 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:39:17,946 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:39:29,954 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:20,709 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:20,973 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:27,137 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:28,142 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:44,154 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:47,642 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:40:48,158 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:40:50,160 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:04,169 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:07,869 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:08,170 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:10,171 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:32,187 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:35,960 INFO SenderThread:10760 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/wandb-summary.json --2022-04-09 17:41:36,192 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:38,194 INFO Thread-11 :10760 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_173901-1dj6b5jf/files/output.log --2022-04-09 17:41:50,823 INFO MainThread:10760 [wandb_run.py:finish():1208] finishing run tera_squid/translation_test/1dj6b5jf -diff --git a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb b/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb -deleted file mode 100644 -index c939775..0000000 -Binary files a/wandb/run-20220409_173901-1dj6b5jf/run-1dj6b5jf.wandb and /dev/null differ -diff --git a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py b/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml b/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_175151-z44hpswp/files/config.yaml b/wandb/run-20220409_175151-z44hpswp/files/config.yaml -deleted file mode 100644 -index 0b2ef04..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 128 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 24 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_175151-z44hpswp/files/diff.patch b/wandb/run-20220409_175151-z44hpswp/files/diff.patch -deleted file mode 100644 -index a6f8b6d..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/diff.patch -+++ /dev/null -@@ -1,634 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e11eb21 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,302 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..a3e7597 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..453b7bc 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_175151-z44hpswp/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..b2d6ded 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_175151-z44hpswp --\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/output.log b/wandb/run-20220409_175151-z44hpswp/files/output.log -deleted file mode 100644 -index 2224687..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/output.log -+++ /dev/null -@@ -1,48 +0,0 @@ -- --train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --translation model saved in checkpoint --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --translation model saved in checkpoint --translation model saved in checkpoint --{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --translation model saved in checkpoint --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt b/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -deleted file mode 100644 -index e3bc5e0..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:21:52.829321", -- "startedAt": "2022-04-09T12:21:51.786614", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=128", -- "--dfeedforward=1024", -- "--epochs=24", -- "--nhead=4", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json b/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -deleted file mode 100644 -index 4d8b4c3..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 107.22583770751953, "_runtime": 695, "_timestamp": 1649507606, "_step": 28, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log b/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -deleted file mode 100644 -index 552d2f2..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log -+++ /dev/null -@@ -1,620 +0,0 @@ --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,796 DEBUG MainThread:14720 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send():179] send: header --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:51,800 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: check_version --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,171 DEBUG SenderThread:14720 [sender.py:send():179] send: run --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,825 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:52,827 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():39] meta init --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:__init__():53] meta init done --2022-04-09 17:51:52,829 DEBUG HandlerThread:14720 [meta.py:probe():210] probe --2022-04-09 17:51:52,837 DEBUG HandlerThread:14720 [meta.py:_setup_git():200] setup git --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_setup_git():207] setup git done --2022-04-09 17:51:52,869 DEBUG HandlerThread:14720 [meta.py:_save_code():89] save code --2022-04-09 17:51:52,876 DEBUG HandlerThread:14720 [meta.py:_save_code():110] save code done --2022-04-09 17:51:52,877 DEBUG HandlerThread:14720 [meta.py:_save_patches():127] save patches --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_patches():169] save patches done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():57] save pip --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_pip():71] save pip done --2022-04-09 17:51:52,928 DEBUG HandlerThread:14720 [meta.py:_save_conda():78] save conda --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:_save_conda():86] save conda done --2022-04-09 17:51:54,259 DEBUG HandlerThread:14720 [meta.py:probe():252] probe done --2022-04-09 17:51:54,261 DEBUG SenderThread:14720 [sender.py:send():179] send: files --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,272 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:51:54,272 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,720 DEBUG SenderThread:14720 [sender.py:send():179] send: config --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:06,575 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:09,721 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:09,721 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:21,569 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:25,148 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:25,149 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:40,576 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:40,576 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:49,874 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:52,213 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:52:55,651 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,140 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:52:56,140 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:11,596 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:11,597 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:14,741 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:23,054 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:27,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:27,074 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:38,173 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:42,499 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:42,500 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:53,596 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:57,929 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:53:57,929 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:53:59,413 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:53:59,414 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:13,359 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:13,359 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,344 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:54:20,345 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:24,527 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:28,793 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:28,793 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:44,227 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:44,227 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:54:55,062 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:54:59,653 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:54:59,653 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:11,338 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:11,339 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:15,098 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:15,099 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:17,278 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:25,911 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:30,519 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:30,519 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:55:37,281 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:45,955 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:55:45,956 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:55:56,468 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:01,086 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:01,589 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:17,078 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:17,078 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:23,379 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:27,343 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:56:32,522 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:32,522 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:56:46,540 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:47,961 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:56:47,961 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:56:57,925 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:03,390 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:03,390 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:18,853 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:18,853 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:28,552 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:57:34,280 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:34,280 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:39,211 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:57:45,145 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:49,734 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:57:49,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:57:59,325 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,341 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:05,342 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:05,789 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:20,790 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:20,790 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:29,955 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:30,176 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:36,214 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:36,214 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:51,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:58:51,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:58:52,209 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:00,845 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:07,147 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:07,147 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 17:59:13,797 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:22,588 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:22,588 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:31,435 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:38,008 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:38,008 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 17:59:53,449 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 17:59:53,450 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:02,140 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:07,706 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:08,884 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:08,884 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:13,617 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:13,618 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:24,366 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:24,367 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:32,786 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:00:36,584 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:39,806 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:39,806 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,224 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:00:55,225 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,715 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:00,716 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:03,610 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:10,649 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:10,649 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:22,153 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:26,073 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:26,073 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:34,217 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:41,491 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:41,492 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,993 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:01:43,994 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:56,918 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:01:56,918 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:04,763 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:12,340 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:12,340 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:27,774 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:27,774 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:35,408 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:38,748 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:43,201 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:43,201 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:44,434 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:02:44,435 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:58,647 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:02:58,647 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:03,720 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:06,291 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:14,117 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:14,117 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,051 DEBUG SenderThread:14720 [sender.py:send():179] send: history --2022-04-09 18:03:26,052 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:29,557 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:29,559 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:36,939 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:03:42,324 INFO MainThread:14720 [wandb_run.py:_restore():1480] restore --2022-04-09 18:03:43,079 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:43,080 DEBUG SenderThread:14720 [sender.py:send():179] send: telemetry --2022-04-09 18:03:43,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:43,580 DEBUG SenderThread:14720 [sender.py:send():179] send: exit --2022-04-09 18:03:43,580 INFO SenderThread:14720 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:03:43,581 INFO SenderThread:14720 [sender.py:send_exit():295] send defer --2022-04-09 18:03:43,581 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:43,582 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,583 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:03:43,583 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:03:43,584 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:03:43,584 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 48639 --} -- --2022-04-09 18:03:43,585 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,586 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:03:43,657 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,657 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send():179] send: stats --2022-04-09 18:03:43,658 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,658 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:03:43,658 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:03:43,658 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send():179] send: summary --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:43,659 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:03:43,659 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:03:43,659 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,659 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:03:43,660 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:43,660 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:03:43,660 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:43,660 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:03:43,660 INFO SenderThread:14720 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:03:43,686 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:44,248 INFO SenderThread:14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt requirements.txt --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log output.log --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json wandb-summary.json --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml config.yaml --2022-04-09 18:03:44,249 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch diff.patch --2022-04-09 18:03:44,251 INFO SenderThread:14720 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py code/train_translation.py --2022-04-09 18:03:44,253 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:03:44,253 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,254 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,258 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:03:44,260 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,260 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:03:44,260 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:44,260 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:03:44,261 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,261 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:03:44,261 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,261 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:03:44,361 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:44,907 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:03:44,908 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:44,908 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,908 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:03:44,909 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 48639 -- total_bytes: 58315 --} -- --2022-04-09 18:03:44,909 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:03:44,909 INFO SenderThread:14720 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:03:44,910 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:03:44,910 INFO HandlerThread:14720 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: final --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send():179] send: footer --2022-04-09 18:03:44,911 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: defer --2022-04-09 18:03:44,911 INFO SenderThread:14720 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:03:45,010 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,011 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,012 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,115 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,116 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,117 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,219 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,219 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,221 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,323 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,323 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,325 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,427 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,427 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,428 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,466 INFO Thread-54 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 18:03:45,472 INFO Thread-52 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 18:03:45,476 INFO Thread-53 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:45,530 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,531 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,532 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,634 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,635 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,636 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,738 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,739 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,740 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,842 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,842 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,844 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:45,946 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:45,946 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:45,948 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,050 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,051 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,053 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,155 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,156 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,157 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,184 INFO Thread-56 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 18:03:46,188 INFO Thread-55 :14720 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:46,259 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,259 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,261 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,363 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,364 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,365 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,468 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:03:46,469 DEBUG SenderThread:14720 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:03:46,469 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:46,470 INFO MainThread:14720 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 58315 -- total_bytes: 58315 --} -- --2022-04-09 18:03:46,472 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:03:46,474 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:03:46,477 DEBUG HandlerThread:14720 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:03:46,478 INFO HandlerThread:14720 [handler.py:finish():638] shutting down handler --2022-04-09 18:03:46,911 INFO WriterThread:14720 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 18:03:47,469 INFO SenderThread:14720 [sender.py:finish():933] shutting down sender --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:03:47,470 INFO SenderThread:14720 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:03:47,483 INFO MainThread:14720 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:03:47,484 INFO MainThread:14720 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:03:47,485 INFO MainThread:14720 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:03:47,525 INFO MainThread:14720 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_175151-z44hpswp/logs/debug.log b/wandb/run-20220409_175151-z44hpswp/logs/debug.log -deleted file mode 100644 -index bb769fe..0000000 ---- a/wandb/run-20220409_175151-z44hpswp/logs/debug.log -+++ /dev/null -@@ -1,140 +0,0 @@ --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'z44hpswp', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-z44hpswp.yaml', 'start_method': 'thread'} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug.log --2022-04-09 17:51:51,787 INFO MainThread:14720 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/logs/debug-internal.log --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():369] calling init triggers --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --config: {'workers': 4, 'epochs': 24, 'batch_size': 128, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:51,788 INFO MainThread:14720 [wandb_init.py:init():418] starting backend --2022-04-09 17:51:51,793 INFO MainThread:14720 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 17:51:51,794 INFO wandb_internal:14720 [internal.py:wandb_internal():91] W&B internal server running at pid: 14720, started at: 2022-04-09 17:51:51.793927 --2022-04-09 17:51:51,795 INFO MainThread:14720 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 17:51:51,796 INFO MainThread:14720 [wandb_init.py:init():423] backend started and connected --2022-04-09 17:51:51,797 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 128, 'dfeedforward': 1024, 'epochs': 24, 'nhead': 4, 'nlayers': 4} --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():465] updated telemetry --2022-04-09 17:51:51,799 INFO MainThread:14720 [wandb_init.py:init():484] communicating current version --2022-04-09 17:51:51,800 INFO WriterThread:14720 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb --2022-04-09 17:51:52,170 INFO MainThread:14720 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 17:51:52,171 INFO MainThread:14720 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 17:51:52,824 INFO SenderThread:14720 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files --2022-04-09 17:51:52,824 INFO SenderThread:14720 [sender.py:_start_run_threads():707] run started: z44hpswp with start time 1649506911 --2022-04-09 17:51:52,826 INFO MainThread:14720 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 17:51:52,826 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:51:53,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/diff.patch --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code/train_translation.py --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/requirements.txt --2022-04-09 17:51:53,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/code --2022-04-09 17:51:54,261 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 17:51:54,262 INFO SenderThread:14720 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 17:51:54,263 INFO SenderThread:14720 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 17:51:54,272 INFO MainThread:14720 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 17:51:54,274 INFO MainThread:14720 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 17:51:54,276 INFO MainThread:14720 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/conda-environment.yaml --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-metadata.json --2022-04-09 17:51:54,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:56,133 INFO Thread-15 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2ih8faqi-code/train_translation.py --2022-04-09 17:51:56,134 INFO Thread-14 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/hxttd0im-wandb-metadata.json --2022-04-09 17:51:56,135 INFO Thread-16 :14720 [upload_job.py:push():133] Uploaded file /tmp/tmp1e33tdlewandb/2f1e53ks-diff.patch --2022-04-09 17:51:56,825 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/config.yaml --2022-04-09 17:51:56,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:51:58,826 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:00,827 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:06,575 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:07,050 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:21,053 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:52:49,877 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:50,064 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:52:55,651 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:52:56,142 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:11,146 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:14,742 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:15,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:17,233 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:35,238 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:38,173 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:53:38,239 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:53:55,247 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:53:59,416 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:00,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:17,258 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:20,346 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:54:21,261 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:54:39,266 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:11,339 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:12,278 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:17,280 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:17,281 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:33,287 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:37,282 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:55:37,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:55:39,290 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:55:57,307 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:01,089 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:01,591 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:19,597 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:23,382 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:23,878 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:56:43,960 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:46,541 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:56:47,040 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:06,045 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:57:39,211 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:40,057 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:57:45,145 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:57:46,061 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:02,065 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:05,790 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:06,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:07,248 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:25,253 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:30,177 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:30,255 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:58:47,288 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:58:52,210 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:58:52,289 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:09,294 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:13,798 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 17:59:14,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 17:59:15,296 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 17:59:33,301 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:07,707 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:08,314 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:13,618 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:14,317 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:31,321 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:36,585 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:00:37,323 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:00:37,324 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:00:55,328 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:00,716 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:01,330 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:17,334 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:22,153 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:22,653 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:01:39,657 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:01:43,994 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:01:44,659 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:03,664 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:02:38,749 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:39,680 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:44,435 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:02:44,933 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:02:59,938 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:03,721 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:04,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:06,221 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:22,227 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/output.log --2022-04-09 18:03:26,052 INFO SenderThread:14720 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:26,231 INFO Thread-11 :14720 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_175151-z44hpswp/files/wandb-summary.json --2022-04-09 18:03:42,322 INFO MainThread:14720 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/z44hpswp -diff --git a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb b/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb -deleted file mode 100644 -index 55f1aff..0000000 -Binary files a/wandb/run-20220409_175151-z44hpswp/run-z44hpswp.wandb and /dev/null differ -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py b/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml b/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml b/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -deleted file mode 100644 -index 194d831..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/config.yaml -+++ /dev/null -@@ -1,109 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 2: -- - 1 -- - 11 -- 3: -- - 2 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 40 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 4 --nlayers: -- desc: null -- value: 6 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch b/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -deleted file mode 100644 -index 979dcc5..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/diff.patch -+++ /dev/null -@@ -1,645 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..42fbde8 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,313 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..371ace5 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..a6d9884 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_180353-vjrenr4z/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..705068b 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_180353-vjrenr4z --\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/output.log b/wandb/run-20220409_180353-vjrenr4z/files/output.log -deleted file mode 100644 -index a2bf91c..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/output.log -+++ /dev/null -@@ -1,102 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --translation model saved in checkpoint --{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --translation model saved in checkpoint --{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --translation model saved in checkpoint --{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --translation model saved in checkpoint --{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --translation model saved in checkpoint --{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --translation model saved in checkpoint --{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --translation model saved in checkpoint --{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --translation model saved in checkpoint --{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --translation model saved in checkpoint --{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --translation model saved in checkpoint --{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --translation model saved in checkpoint --{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --translation model saved in checkpoint --{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --translation model saved in checkpoint --{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --translation model saved in checkpoint --{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --translation model saved in checkpoint --{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --translation model saved in checkpoint --{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --translation model saved in checkpoint --{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --translation model saved in checkpoint --{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --translation model saved in checkpoint --{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --translation model saved in checkpoint --{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --translation model saved in checkpoint --{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --translation model saved in checkpoint --{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --translation model saved in checkpoint --{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --translation model saved in checkpoint --{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --translation model saved in checkpoint --{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --translation model saved in checkpoint --{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --translation model saved in checkpoint --{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --translation model saved in checkpoint --{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --translation model saved in checkpoint --{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --translation model saved in checkpoint --{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --translation model saved in checkpoint --{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --translation model saved in checkpoint --{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --translation model saved in checkpoint --{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --translation model saved in checkpoint --{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --translation model saved in checkpoint --{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --translation model saved in checkpoint -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt b/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -deleted file mode 100644 -index 3e24107..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:33:55.138080", -- "startedAt": "2022-04-09T12:33:53.912960", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=40", -- "--nhead=4", -- "--nlayers=6" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json b/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -deleted file mode 100644 -index dbd5bb9..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 571.8498382568359, "_runtime": 1394, "_timestamp": 1649509027, "_step": 47, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -deleted file mode 100644 -index 6ac5722..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log -+++ /dev/null -@@ -1,809 +0,0 @@ --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,947 DEBUG MainThread:18842 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 DEBUG SenderThread:18842 [sender.py:send():179] send: header --2022-04-09 18:03:53,957 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:53,958 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:54,487 DEBUG SenderThread:18842 [sender.py:send():179] send: run --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,124 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:55,130 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():39] meta init --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:__init__():53] meta init done --2022-04-09 18:03:55,137 DEBUG HandlerThread:18842 [meta.py:probe():210] probe --2022-04-09 18:03:55,146 DEBUG HandlerThread:18842 [meta.py:_setup_git():200] setup git --2022-04-09 18:03:55,213 DEBUG HandlerThread:18842 [meta.py:_setup_git():207] setup git done --2022-04-09 18:03:55,214 DEBUG HandlerThread:18842 [meta.py:_save_code():89] save code --2022-04-09 18:03:55,241 DEBUG HandlerThread:18842 [meta.py:_save_code():110] save code done --2022-04-09 18:03:55,242 DEBUG HandlerThread:18842 [meta.py:_save_patches():127] save patches --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_patches():169] save patches done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():57] save pip --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_pip():71] save pip done --2022-04-09 18:03:55,334 DEBUG HandlerThread:18842 [meta.py:_save_conda():78] save conda --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,710 DEBUG HandlerThread:18842 [meta.py:_save_conda():86] save conda done --2022-04-09 18:03:56,711 DEBUG HandlerThread:18842 [meta.py:probe():252] probe done --2022-04-09 18:03:56,713 DEBUG SenderThread:18842 [sender.py:send():179] send: files --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,723 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:03:56,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,196 DEBUG SenderThread:18842 [sender.py:send():179] send: config --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:04:09,890 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:12,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:12,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:23,959 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:27,637 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:27,637 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:43,070 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:43,071 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:04:54,578 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:04:58,609 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:04:58,609 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:13,418 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,096 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:14,096 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:19,610 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:25,318 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:05:29,536 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:29,536 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,041 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:05:45,042 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:05:45,711 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:55,878 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:00,385 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:00,385 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,115 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:12,116 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:15,812 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:15,812 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:26,509 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:06:31,252 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:31,252 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:06:39,204 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:46,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:06:46,699 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:06:57,088 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:02,128 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:02,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:07:07,189 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:17,560 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:17,560 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:27,788 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:33,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:33,039 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:48,472 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:07:48,472 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:07:58,460 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:03,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:03,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:10,495 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:10,496 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,773 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:16,774 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:19,358 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:19,358 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:29,127 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:08:34,827 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:34,827 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:08:43,393 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:50,258 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:08:50,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:08:59,791 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:05,625 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:05,625 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:09,196 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:21,079 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:21,079 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:30,544 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:36,425 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:36,426 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,629 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:09:37,630 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:51,758 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:09:51,758 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:01,192 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:10:06,067 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,213 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:07,213 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:22,576 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:22,576 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,752 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:10:37,928 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:37,928 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:10:53,268 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:10:53,268 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:02,406 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:08,610 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:08,610 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:12,361 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:18,663 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:23,966 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:23,966 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:33,001 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:39,600 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:39,600 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:11:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:54,944 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:11:54,944 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:03,627 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:10,280 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:10,280 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:12,130 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:25,635 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:25,635 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:34,297 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:12:36,014 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:40,989 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:40,989 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:56,322 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:12:56,323 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:00,307 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:05,226 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:11,687 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:11,687 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:27,035 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:27,035 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:35,749 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:13:42,474 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:42,475 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:13:57,111 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:13:57,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:03,217 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:06,507 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:13,240 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:13,240 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,985 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:26,986 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:28,667 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:28,668 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:37,148 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:14:44,310 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:44,310 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:14:53,107 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:59,666 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:14:59,666 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:07,695 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:14,998 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:14,998 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:17,525 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:30,334 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:30,334 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:38,429 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:15:44,460 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:45,673 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:15:45,673 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:01,020 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:01,020 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:09,031 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:16,349 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:16,349 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:31,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:31,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:39,689 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:46,381 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:47,261 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:16:47,261 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:16:52,591 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:02,605 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:02,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:10,351 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:16,742 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:17,935 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:17,935 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:33,308 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:33,308 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,998 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:17:44,097 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:17:44,098 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:48,657 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:17:48,817 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:04,733 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:04,733 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:10,263 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:11,869 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:20,065 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:20,065 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:35,442 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:35,442 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,258 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:18:42,271 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:50,780 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:18:50,780 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:06,176 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:06,176 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:12,884 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:21,533 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:21,533 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:36,872 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:36,872 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:41,320 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:43,542 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:19:47,487 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:52,222 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:19:52,222 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:07,575 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:07,575 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:11,295 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:14,395 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:22,919 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:22,920 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:38,284 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:38,284 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:20:39,161 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:44,947 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:20:53,719 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:20:53,719 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:05,165 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:09,154 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:09,154 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:15,554 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:24,513 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:24,513 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,048 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:21:32,049 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:39,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:39,921 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:21:46,176 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:55,292 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:21:55,292 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:10,678 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:10,679 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:16,761 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:26,337 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:26,337 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:37,631 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:41,696 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:41,696 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:22:43,842 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:22:43,843 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:47,574 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:22:57,038 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:22:57,038 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:06,284 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:12,473 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:12,473 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:18,151 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:27,820 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:27,820 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:23:37,389 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:43,266 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:43,266 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:48,907 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:23:58,729 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:23:58,729 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,447 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:03,448 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:14,167 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:14,167 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:19,591 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:29,519 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:29,520 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:24:31,880 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:44,877 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:24:44,877 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:24:50,128 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:00,259 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:00,259 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:15,606 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:15,606 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:20,792 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:30,948 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:30,948 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:32,468 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,976 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:25:38,977 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:46,374 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:25:46,374 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:25:51,548 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:01,722 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:01,723 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:03,261 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:03,262 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:17,072 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:17,072 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:22,124 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:26:32,410 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:32,411 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:26:38,163 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:47,810 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:26:47,810 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:26:52,753 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,241 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:03,241 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: history --2022-04-09 18:27:07,299 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:18,699 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:18,700 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:23,342 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:34,106 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:34,107 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z --2022-04-09 18:27:39,696 INFO MainThread:18842 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 0 --2022-04-09 18:27:39,697 INFO MainThread:18842 [wandb_run.py:_restore():1480] restore --2022-04-09 18:27:40,003 DEBUG SenderThread:18842 [sender.py:send():179] send: telemetry --2022-04-09 18:27:40,004 DEBUG SenderThread:18842 [sender.py:send():179] send: exit --2022-04-09 18:27:40,005 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,005 INFO SenderThread:18842 [sender.py:send_exit():287] handling exit code: 0 --2022-04-09 18:27:40,006 INFO SenderThread:18842 [sender.py:send_exit():295] send defer --2022-04-09 18:27:40,006 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,008 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,008 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 0 --2022-04-09 18:27:40,008 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,010 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 0 --2022-04-09 18:27:40,010 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 1 --2022-04-09 18:27:40,011 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,011 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 1 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 1 --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send():179] send: stats --2022-04-09 18:27:40,067 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,067 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 2 --2022-04-09 18:27:40,067 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,067 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 2 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 3 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 3 --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send():179] send: summary --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:40,068 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 3 --2022-04-09 18:27:40,068 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 4 --2022-04-09 18:27:40,068 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,068 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 4 --2022-04-09 18:27:40,069 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,069 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 4 --2022-04-09 18:27:40,110 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:40,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:40,461 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 5 --2022-04-09 18:27:40,462 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:40,463 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:40,464 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 5 --2022-04-09 18:27:40,464 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 2 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 49395 --} -- --2022-04-09 18:27:40,465 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:40,465 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 5 --2022-04-09 18:27:40,466 INFO SenderThread:18842 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:27:40,566 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,201 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:41,202 INFO SenderThread:18842 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:27:41,205 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt requirements.txt --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log output.log --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:27:41,206 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json wandb-summary.json --2022-04-09 18:27:41,207 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml config.yaml --2022-04-09 18:27:41,211 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch diff.patch --2022-04-09 18:27:41,220 INFO SenderThread:18842 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py code/train_translation.py --2022-04-09 18:27:41,223 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 6 --2022-04-09 18:27:41,224 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,225 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,225 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 6 --2022-04-09 18:27:41,225 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 49395 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,226 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,226 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 6 --2022-04-09 18:27:41,230 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:41,231 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 7 --2022-04-09 18:27:41,232 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,232 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 7 --2022-04-09 18:27:41,232 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,232 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 7 --2022-04-09 18:27:41,332 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:41,915 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 8 --2022-04-09 18:27:41,915 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:41,917 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,917 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 8 --2022-04-09 18:27:41,918 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:41,919 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 8 --2022-04-09 18:27:41,919 INFO SenderThread:18842 [sender.py:send_request_defer():342] send defer: 9 --2022-04-09 18:27:41,921 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: defer --2022-04-09 18:27:41,921 INFO HandlerThread:18842 [handler.py:handle_request_defer():141] handle defer: 9 --2022-04-09 18:27:41,921 DEBUG SenderThread:18842 [sender.py:send():179] send: final --2022-04-09 18:27:41,922 DEBUG SenderThread:18842 [sender.py:send():179] send: footer --2022-04-09 18:27:41,923 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: defer --2022-04-09 18:27:41,923 INFO SenderThread:18842 [sender.py:send_request_defer():304] handle sender defer: 9 --2022-04-09 18:27:42,024 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,024 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,025 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,127 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,128 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,129 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,231 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,231 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,233 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,335 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,335 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,336 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,438 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,439 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,440 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,542 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,542 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,544 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,592 INFO Thread-73 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:27:42,594 INFO Thread-71 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:27:42,599 INFO Thread-75 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:27:42,601 INFO Thread-72 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:42,602 INFO Thread-74 :18842 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:42,645 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,645 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,646 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,747 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,748 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,749 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,851 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: poll_exit --2022-04-09 18:27:42,851 DEBUG SenderThread:18842 [sender.py:send_request():193] send_request: poll_exit --2022-04-09 18:27:42,852 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:42,853 INFO MainThread:18842 [wandb_run.py:_wait_for_finish():1630] got exit ret: done: true --exit_result { --} --file_counts { -- wandb_count: 7 -- other_count: 1 --} --pusher_stats { -- uploaded_bytes: 62216 -- total_bytes: 62216 --} -- --2022-04-09 18:27:42,855 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: get_summary --2022-04-09 18:27:42,857 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: sampled_history --2022-04-09 18:27:42,860 DEBUG HandlerThread:18842 [handler.py:handle_request():124] handle_request: shutdown --2022-04-09 18:27:42,861 INFO HandlerThread:18842 [handler.py:finish():638] shutting down handler --2022-04-09 18:27:42,922 INFO WriterThread:18842 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:27:43,852 INFO SenderThread:18842 [sender.py:finish():933] shutting down sender --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:27:43,853 INFO SenderThread:18842 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_summary():1785] rendering summary --2022-04-09 18:27:43,866 INFO MainThread:18842 [wandb_run.py:_show_history():1823] rendering history --2022-04-09 18:27:43,868 INFO MainThread:18842 [wandb_run.py:_show_files():1852] logging synced files --2022-04-09 18:27:43,884 INFO MainThread:18842 [internal.py:handle_exit():78] Internal process exited -diff --git a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log b/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -deleted file mode 100644 -index 55b000f..0000000 ---- a/wandb/run-20220409_180353-vjrenr4z/logs/debug.log -+++ /dev/null -@@ -1,230 +0,0 @@ --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'vjrenr4z', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml', 'start_method': 'thread'} --2022-04-09 18:03:53,918 INFO MainThread:18842 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug.log --2022-04-09 18:03:53,919 INFO MainThread:18842 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/logs/debug-internal.log --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:03:53,920 INFO MainThread:18842 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --config: {'workers': 4, 'epochs': 40, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 1024, 'nlayers': 6, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:53,921 INFO MainThread:18842 [wandb_init.py:init():418] starting backend --2022-04-09 18:03:53,941 INFO MainThread:18842 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:03:53,943 INFO MainThread:18842 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:03:53,945 INFO wandb_internal:18842 [internal.py:wandb_internal():91] W&B internal server running at pid: 18842, started at: 2022-04-09 18:03:53.943037 --2022-04-09 18:03:53,947 INFO MainThread:18842 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:03:53,950 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 40, 'nhead': 4, 'nlayers': 6} --2022-04-09 18:03:53,955 INFO MainThread:18842 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:03:53,956 INFO MainThread:18842 [wandb_init.py:init():484] communicating current version --2022-04-09 18:03:53,957 INFO WriterThread:18842 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb --2022-04-09 18:03:54,486 INFO MainThread:18842 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:03:54,487 INFO MainThread:18842 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:03:55,116 INFO SenderThread:18842 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files --2022-04-09 18:03:55,117 INFO SenderThread:18842 [sender.py:_start_run_threads():707] run started: vjrenr4z with start time 1649507633 --2022-04-09 18:03:55,128 INFO MainThread:18842 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:03:55,129 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/diff.patch --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/requirements.txt --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code/train_translation.py --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:03:56,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/code --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:03:56,713 INFO SenderThread:18842 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:03:56,714 INFO SenderThread:18842 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:03:56,723 INFO MainThread:18842 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:03:56,725 INFO MainThread:18842 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:03:56,726 INFO MainThread:18842 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:03:56,727 INFO MainThread:18842 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/conda-environment.yaml --2022-04-09 18:03:57,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:57,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-metadata.json --2022-04-09 18:03:57,913 INFO Thread-14 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/3wu5f9t3-wandb-metadata.json --2022-04-09 18:03:57,923 INFO Thread-16 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/2smukmpq-diff.patch --2022-04-09 18:03:57,930 INFO Thread-15 :18842 [upload_job.py:push():133] Uploaded file /tmp/tmpzmoqkqw7wandb/371w3hlh-code/train_translation.py --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:03:59,117 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/config.yaml --2022-04-09 18:04:01,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:03,118 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:09,891 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:04:10,122 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:04:11,123 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:04:29,127 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:13,420 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:14,143 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:19,611 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:20,217 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:21,219 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:41,224 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:05:45,712 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:05:46,334 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:05:47,336 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:07,341 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:12,116 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:12,343 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:06:13,344 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:35,351 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,205 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:06:39,374 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:03,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,190 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:07:07,380 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:07,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:07:09,381 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:07:29,386 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:10,500 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:11,402 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:16,774 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:17,405 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:08:37,410 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,394 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:08:43,412 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:05,419 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,197 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:09,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:09:33,430 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:09:37,630 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:09:38,434 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:01,440 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:05,442 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:06,067 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:10:06,682 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:10:07,683 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:10:31,689 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:12,362 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:12,703 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:18,664 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:18,705 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:19,707 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:37,712 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:11:41,922 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:11:42,714 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:11:43,715 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:07,721 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:11,723 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:12,130 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:12,734 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:31,739 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:35,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:36,015 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:12:36,741 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:12:55,746 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:12:59,748 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:00,307 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:00,912 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:13:01,913 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:21,919 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:13:57,112 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:13:57,932 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:03,218 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:03,934 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:21,939 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:26,986 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:27,945 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:14:47,950 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,108 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:14:53,953 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:13,958 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:17,526 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:18,140 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:40,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:15:44,461 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:15:45,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:15:46,147 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:06,158 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:16:46,382 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:47,176 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:52,592 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:16:53,194 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:16:54,197 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:12,202 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:16,743 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:17,346 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:17:18,348 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:40,354 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,098 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:17:44,357 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:06,364 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,264 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:10,365 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:38,376 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,271 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:18:42,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:18:44,377 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:04,383 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:41,321 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:41,396 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:19:47,488 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:19:48,401 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:06,406 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:11,296 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:11,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:12,408 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:34,414 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:20:39,162 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:20:39,416 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:20:40,417 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:00,422 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:04,424 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:05,166 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:05,425 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:26,433 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,050 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:21:32,675 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:21:54,681 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:37,631 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:37,700 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:22:43,843 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:22:44,765 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:22:44,766 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:02,770 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,284 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:06,892 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:32,899 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:37,389 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:23:38,007 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:23:39,009 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:23:59,017 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,019 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:03,448 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:04,073 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:27,080 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:31,880 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:24:32,082 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:24:33,083 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:24:53,088 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:32,469 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:33,103 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:38,977 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:25:39,145 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:25:41,146 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:25:59,152 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:03,262 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:04,154 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:05,155 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:33,162 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:26:38,164 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:26:38,225 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:26:39,168 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:03,173 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,175 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/output.log --2022-04-09 18:27:07,300 INFO SenderThread:18842 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:08,179 INFO Thread-11 :18842 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_180353-vjrenr4z/files/wandb-summary.json --2022-04-09 18:27:39,695 INFO MainThread:18842 [wandb_run.py:finish():1208] finishing run tera_squid/context_enhancement/vjrenr4z -diff --git a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb b/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb -deleted file mode 100644 -index 2a205f7..0000000 -Binary files a/wandb/run-20220409_180353-vjrenr4z/run-vjrenr4z.wandb and /dev/null differ -diff --git a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py b/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -deleted file mode 100644 -index f284015..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/code/train_translation.py -+++ /dev/null -@@ -1,382 +0,0 @@ --import numpy as np --from pathlib import Path --import argparse --import json --import math --import os --import random --import signal --import subprocess --import sys --import time -- --import torch --from torch import nn, optim --from torch.nn import Transformer --import torchtext --import t_dataset --from t_dataset import Translation_dataset_t --from t_dataset import MyCollate --import translation_utils --from translation_utils import TokenEmbedding, PositionalEncoding --from translation_utils import create_mask --from transformers import BertModel --from transformers import AutoTokenizer --from torch import Tensor --from torchtext.data.metrics import bleu_score --from models import Translator --from models import BarlowTwins -- --import wandb -- -- --#import barlow --os.environ['TRANSFORMERS_OFFLINE'] = 'yes' --os.environ['WANDB_START_METHOD'] = 'thread' -- --MANUAL_SEED = 4444 -- --random.seed(MANUAL_SEED) --np.random.seed(MANUAL_SEED) --torch.manual_seed(MANUAL_SEED) --torch.backends.cudnn.deterministic = True -- -- --parser = argparse.ArgumentParser(description = 'Translation') -- --# Training hyper-parameters: --parser.add_argument('--workers', default=4, type=int, metavar='N', -- help='number of data loader workers') --parser.add_argument('--epochs', default=5, type=int, metavar='N', -- help='number of total epochs to run') --parser.add_argument('--batch_size', default=4, type=int, metavar='n', -- help='mini-batch size') --parser.add_argument('--learning-rate', default=0.2, type=float, metavar='LR', -- help='base learning rate') --parser.add_argument('--dropout', default=0.01, type=float, metavar='d', -- help='dropout for training translation transformer') --parser.add_argument('--weight-decay', default=1e-6, type=float, metavar='W', -- help='weight decay') --parser.add_argument('--clip', default=1, type=float, metavar='GC', -- help='Gradient Clipping') --parser.add_argument('--betas', default=(0.9, 0.98), type=tuple, metavar='B', -- help='betas for Adam Optimizer') --parser.add_argument('--eps', default=1e-9, type=float, metavar='E', -- help='eps for Adam optimizer') --parser.add_argument('--loss_fn', default='cross_entropy', type=str, metavar='LF', -- help='loss function for translation') -- --# Transformer parameters: --parser.add_argument('--dmodel', default=768, type=int, metavar='T', -- help='dimension of transformer encoder') --parser.add_argument('--nhead', default=4, type= int, metavar='N', -- help= 'number of heads in transformer') --parser.add_argument('--dfeedforward', default=256, type=int, metavar='F', -- help= 'dimension of feedforward layer in transformer encoder') --parser.add_argument('--nlayers', default=3, type=int, metavar= 'N', -- help='number of layers of transformer encoder') --parser.add_argument('--projector', default='768-256', type=str, -- metavar='MLP', help='projector MLP') -- --# Tokenizer: --parser.add_argument('--tokenizer', default='bert-base-multilingual-uncased', type=str, -- metavar='T', help= 'tokenizer') --parser.add_argument('--mbert-out-size', default=768, type=int, metavar='MO', -- help='Dimension of mbert output') --# Paths: --parser.add_argument('--checkpoint_dir', default='./checkpoint/', type=Path, -- metavar='DIR', help='path to checkpoint directory') -- --# to load or barlow or not: --parser.add_argument('--load', default=0, type=int, -- metavar='DIR', help='to load barlow twins encoder or not') -- --# calculate bleu: --parser.add_argument('--checkbleu', default=5 , type=int, -- metavar='BL', help='check bleu after these number of epochs') --# train or test dataset --parser.add_argument('--train', default=True , type=bool, -- metavar='T', help='selecting train set') -- --parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --parser.add_argument('--test_translation', default=0, type=int, -- metavar='TT', help='testing translation_score') --''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. --''' -- --args = parser.parse_args() --# print(args.load) --os.environ["TOKENIZERS_PARALLELISM"] = "true" -- --def main(): -- -- # print("entered main") -- args.ngpus_per_node = torch.cuda.device_count() -- if 'SLURM_JOB_ID' in os.environ: -- # single-node and multi-node distributed training on SLURM cluster -- # requeue job on SLURM preemption -- signal.signal(signal.SIGUSR1, handle_sigusr1) -- signal.signal(signal.SIGTERM, handle_sigterm) -- # find a common host name on all nodes -- # assume scontrol returns hosts in the same order on all nodes -- cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') -- stdout = subprocess.check_output(cmd.split()) -- host_name = stdout.decode().splitlines()[0] -- args.rank = int(os.getenv('SLURM_NODEID')) * args.ngpus_per_node -- args.world_size = int(os.getenv('SLURM_NNODES')) * args.ngpus_per_node -- args.dist_url = f'tcp://{host_name}:58472' -- else: -- # single-node distributed training -- args.rank = 0 -- args.dist_url = 'tcp://localhost:58472' -- args.world_size = args.ngpus_per_node -- torch.multiprocessing.spawn(main_worker, (args,), args.ngpus_per_node) -- -- --def main_worker(gpu, args): -- -- args.rank += gpu -- torch.distributed.init_process_group( -- backend='nccl', init_method=args.dist_url, -- world_size=args.world_size, rank=args.rank) -- -- if args.rank == 0: -- -- wandb.init(config=args, project='translation_test')############################################# -- wandb.config.update(args) -- config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) -- stats_file = open(args.checkpoint_dir / 'stats.txt', 'a', buffering=1) -- print(' '.join(sys.argv)) -- print(' '.join(sys.argv), file=stats_file) -- -- torch.cuda.set_device(gpu) -- torch.backends.cudnn.benchmark = True -- -- dataset = Translation_dataset_t(train=args.train) -- src_vocab_size = dataset.de_vocab_size -- trg_vocab_size = dataset.en_vocab_size -- tokenizer = dataset.tokenizer -- pad_idx = tokenizer.pad_token_id -- sos_idx = tokenizer.cls_token_id -- eos_idx = tokenizer.sep_token_id -- --# transformer1 = nn.TransformerEncoderLayer(d_model = args.dmodel, nhead=args.nhead, dim_feedforward=args.dfeedforward, batch_first=True) -- # t_enc = nn.TransformerEncoder(transformer1, num_layers=args.nlayers) -- # print(src_vocab_size, trg_vocab_size) -- mbert = BertModel.from_pretrained('bert-base-multilingual-uncased') -- transformer = Transformer(d_model=args.dmodel, -- nhead=args.nhead, -- num_encoder_layers=args.nlayers, -- num_decoder_layers = args.nlayers, -- dim_feedforward=args.dfeedforward, -- dropout=args.dropout) -- model = Translator(mbert=mbert, transformer= transformer, tgt_vocab_size=trg_vocab_size, emb_size=args.mbert_out_size).cuda(gpu) -- # print(model.state_dict) --# model_barlow = barlow.BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=model.transformer.encoder, lambd=args.lambd).cuda(gpu) -- -- # args.load = False -- -- if args.load == 1: -- # print(args.load) -- # print('inside') -- print('loading barlow model') -- t_enc = model.transformer.encoder -- barlow = BarlowTwins(projector_layers=args.projector, mbert_out_size=args.mbert_out_size, transformer_enc=t_enc, mbert=mbert, lambd=0.0051).cuda(gpu) -- ### note: lambd is just a placeholder -- ckpt = torch.load(args.checkpoint_dir/ 'barlow_checkpoint.pth', -- map_location='cpu') -- barlow.load_state_dict(ckpt['model']) -- model.transformer.encoder = barlow.transformer_enc -- model.mbert = barlow.mbert -- ''' -- to_do: -- if post_train: -- torch.load(model.states_dict) -- model.transformer.encoder = model_barlow -- -- ''' --# model = nn.SyncBatchNorm.convert_sync_batchnorm(model) -- -- param_weights = [] -- param_biases = [] -- for param in model.parameters(): -- if param.ndim == 1: -- param_biases.append(param) -- else: -- param_weights.append(param) -- parameters = [{'params': param_weights}, {'params': param_biases}] -- model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu], find_unused_parameters=True) -- --########################################################### -- optimizer =torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=args.betas, eps=args.eps) -- -- if args.loss_fn == 'cross_entropy': -- loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx) --############################################################## -- -- start_epoch = 0 -- -- sampler = torch.utils.data.distributed.DistributedSampler(dataset) -- -- assert args.batch_size % args.world_size == 0 -- per_device_batch_size = args.batch_size // args.world_size -- ############################### -- loader = torch.utils.data.DataLoader( -- dataset, batch_size=per_device_batch_size, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- -- test_loader = torch.utils.data.DataLoader( -- dataset, batch_size=1, num_workers=args.workers, -- pin_memory=True, sampler=sampler, collate_fn = MyCollate(tokenizer=tokenizer,bert2id_dict=dataset.bert2id_dict)) -- ############################# -- start_time = time.time() -- -- -- if not args.test_translation: -- -- for epoch in range(start_epoch, args.epochs): -- sampler.set_epoch(epoch) -- epoch_loss = 0 -- for step, (sent) in enumerate(loader, start=epoch * len(loader)): -- src = sent[0].cuda(gpu, non_blocking=True) -- tgt_inp = sent[2].cuda(gpu, non_blocking=True) -- tgt_out = sent[3].cuda(gpu, non_blocking=True) -- -- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) -- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) -- -- optimizer.zero_grad() -- -- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) -- loss.backward() -- -- optimizer.step() -- # losses += loss.item() -- -- # wandb.log({'iter_loss': loss}) -- epoch_loss += loss.item() -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -- -- if step % args.print_freq == 0: -- if args.rank == 0: -- stats = dict(epoch=epoch, step=step, -- loss=loss.item(), -- time=int(time.time() - start_time)) -- print(json.dumps(stats)) -- print(json.dumps(stats), file=stats_file) -- if args.rank == 0: -- -- wandb.log({"epoch_loss":epoch_loss}) -- # save checkpoint -- state = dict(epoch=epoch + 1, model=model.module.state_dict(), -- optimizer=optimizer.state_dict()) -- # print(model.state_dict) -- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') -- print('translation model saved in', args.checkpoint_dir) -- -- ############################################################## -- if args.rank == 0: -- if epoch%args.checkbleu ==0 : -- -- bleu_score = checkbleu(model, tokenizer, test_loader, gpu) -- wandb.log({'bleu_score': bleu_score}) -- # print(bleu_score(predicted, target)) -- ############################################################## -- # if epoch%1 ==0 : -- # torch.save(model.module.state_dict(), -- # 'path.pth') -- # print("Model is saved") -- # if args.rank == 0: -- # # save checkpoint -- # state = dict(epoch=epoch + 1, model=model.state_dict(), -- # optimizer=optimizer.state_dict()) -- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') -- # print('saved translation model in', args.checkpoint_dir) -- wandb.finish() -- -- else: -- -- bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) -- print('test_bleu_score', bleu_score) -- if args.rank == 0: -- wandb.log({'bleu_score': bleu_score}) -- -- --def checkbleu(model, tokenizer, test_loader, gpu): -- -- model.eval() -- predicted=[] -- target=[] -- -- for i in test_loader: -- src = i[0].cuda(gpu, non_blocking=True) -- tgt_out = i[3].cuda(gpu, non_blocking=True) -- num_tokens = src.shape[0] -- -- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) -- out = translate(model, src, tokenizer, src_mask, gpu) -- predicted.append(out) -- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- -- try: -- bleu_score(predicted, target) -- except: -- predicted.pop() -- target.pop() -- -- bleu = bleu_score(predicted, target) -- -- return bleu -- --''' --todo: -- BLEU score --''' -- --# function to generate output sequence using greedy algorithm --def greedy_decode(model, src, src_mask, max_len, start_symbol, eos_idx, gpu): -- src = src -- src_mask = src_mask -- -- memory = model.module.encode(src, src_mask) -- ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).cuda(gpu, non_blocking=True) -- for i in range(max_len-1): -- memory = memory -- tgt_mask = (translation_utils.generate_square_subsequent_mask(ys.size(0)) -- .type(torch.bool)).cuda(gpu, non_blocking=True) -- out = model.module.decode(ys, memory, tgt_mask) -- out = out.transpose(0, 1) -- prob = model.module.generator(out[:, -1]) -- _, next_word = torch.max(prob, dim=1) -- next_word = next_word.item() -- -- ys = torch.cat([ys, -- torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0) -- if next_word == eos_idx: -- break -- return ys -- -- --# actual function to translate input sentence into target language --def translate(model: torch.nn.Module, -- src: torch.tensor, -- tokenizer,src_mask, gpu): -- model.eval() -- -- num_tokens = src.shape[0] -- -- -- tgt_tokens = greedy_decode( -- model, src, src_mask, max_len=num_tokens + 5, start_symbol=tokenizer.cls_token_id, eos_idx=tokenizer.sep_token_id, gpu=gpu).flatten() -- return tokenizer.convert_ids_to_tokens(tgt_tokens) -- -- --if __name__ == '__main__': -- main() -- wandb.finish() -diff --git a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml b/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -deleted file mode 100644 -index 72eed10..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -+++ /dev/null -@@ -1,124 +0,0 @@ --name: ectc --channels: -- - pytorch -- - defaults --dependencies: -- - _libgcc_mutex=0.1=main -- - _openmp_mutex=4.5=1_gnu -- - blas=1.0=mkl -- - brotlipy=0.7.0=py37h27cfd23_1003 -- - bzip2=1.0.8=h7b6447c_0 -- - ca-certificates=2022.3.18=h06a4308_0 -- - certifi=2021.10.8=py37h06a4308_2 -- - cffi=1.15.0=py37hd667e15_1 -- - cryptography=36.0.0=py37h9ce1e76_0 -- - cudatoolkit=11.3.1=h2bc3f7f_2 -- - ffmpeg=4.3=hf484d3e_0 -- - freetype=2.11.0=h70c0345_0 -- - giflib=5.2.1=h7b6447c_0 -- - gmp=6.2.1=h2531618_2 -- - gnutls=3.6.15=he1e5248_0 -- - idna=3.3=pyhd3eb1b0_0 -- - intel-openmp=2021.4.0=h06a4308_3561 -- - jpeg=9d=h7f8727e_0 -- - lame=3.100=h7b6447c_0 -- - lcms2=2.12=h3be6417_0 -- - ld_impl_linux-64=2.35.1=h7274673_9 -- - libffi=3.3=he6710b0_2 -- - libgcc-ng=9.3.0=h5101ec6_17 -- - libgomp=9.3.0=h5101ec6_17 -- - libiconv=1.15=h63c8f33_5 -- - libidn2=2.3.2=h7f8727e_0 -- - libpng=1.6.37=hbc83047_0 -- - libstdcxx-ng=9.3.0=hd4cf53a_17 -- - libtasn1=4.16.0=h27cfd23_0 -- - libtiff=4.2.0=h85742a9_0 -- - libunistring=0.9.10=h27cfd23_0 -- - libuv=1.40.0=h7b6447c_0 -- - libwebp=1.2.2=h55f646e_0 -- - libwebp-base=1.2.2=h7f8727e_0 -- - lz4-c=1.9.3=h295c915_1 -- - mkl=2021.4.0=h06a4308_640 -- - mkl-service=2.4.0=py37h7f8727e_0 -- - mkl_fft=1.3.1=py37hd3c417c_0 -- - mkl_random=1.2.2=py37h51133e4_0 -- - ncurses=6.3=h7f8727e_2 -- - nettle=3.7.3=hbbd107a_1 -- - numpy-base=1.21.2=py37h79a1101_0 -- - openh264=2.1.1=h4ff587b_0 -- - openssl=1.1.1n=h7f8727e_0 -- - pip=21.2.2=py37h06a4308_0 -- - pycparser=2.21=pyhd3eb1b0_0 -- - pyopenssl=22.0.0=pyhd3eb1b0_0 -- - pysocks=1.7.1=py37_1 -- - python=3.7.11=h12debd9_0 -- - pytorch=1.11.0=py3.7_cuda11.3_cudnn8.2.0_0 -- - pytorch-mutex=1.0=cuda -- - readline=8.1.2=h7f8727e_1 -- - requests=2.27.1=pyhd3eb1b0_0 -- - setuptools=58.0.4=py37h06a4308_0 -- - six=1.16.0=pyhd3eb1b0_1 -- - sqlite=3.38.0=hc218d9a_0 -- - tk=8.6.11=h1ccaba5_0 -- - torchaudio=0.11.0=py37_cu113 -- - typing_extensions=4.1.1=pyh06a4308_0 -- - wheel=0.37.1=pyhd3eb1b0_0 -- - xz=5.2.5=h7b6447c_0 -- - zlib=1.2.11=h7f8727e_4 -- - zstd=1.4.9=haebb681_0 -- - pip: -- - aiohttp==3.8.1 -- - aiosignal==1.2.0 -- - async-timeout==4.0.2 -- - asynctest==0.13.0 -- - attrs==21.4.0 -- - blessings==1.7 -- - charset-normalizer==2.0.12 -- - click==8.0.4 -- - configparser==5.2.0 -- - datasets==1.16.1 -- - dill==0.3.4 -- - docker-pycreds==0.4.0 -- - filelock==3.6.0 -- - frozenlist==1.3.0 -- - fsspec==2022.2.0 -- - gitdb==4.0.9 -- - gitpython==3.1.27 -- - gpustat==0.6.0 -- - huggingface-hub==0.4.0 -- - importlib-metadata==4.11.3 -- - joblib==1.1.0 -- - multidict==6.0.2 -- - multiprocess==0.70.12.2 -- - numpy==1.21.5 -- - nvidia-ml-py3==7.352.0 -- - packaging==21.3 -- - pandas==1.3.5 -- - pathtools==0.1.2 -- - pillow==9.0.1 -- - promise==2.3 -- - protobuf==3.19.4 -- - psutil==5.9.0 -- - pyarrow==7.0.0 -- - pyparsing==3.0.7 -- - python-dateutil==2.8.2 -- - pytz==2022.1 -- - pyyaml==6.0 -- - regex==2022.3.15 -- - sacremoses==0.0.49 -- - sentry-sdk==1.5.8 -- - shortuuid==1.0.8 -- - smmap==5.0.0 -- - subprocess32==3.5.4 -- - tokenizers==0.10.3 -- - torch==1.11.0 -- - torchtext==0.12.0 -- - torchvision==0.9.1 -- - tqdm==4.63.1 -- - transformers==4.14.1 -- - urllib3==1.26.9 -- - wandb==0.10.31 -- - xxhash==3.0.0 -- - yarl==1.7.2 -- - zipp==3.7.0 --prefix: /home/ivlabs/miniconda3/envs/ectc -diff --git a/wandb/run-20220409_182749-paufev36/files/config.yaml b/wandb/run-20220409_182749-paufev36/files/config.yaml -deleted file mode 100644 -index c4a0d20..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/config.yaml -+++ /dev/null -@@ -1,104 +0,0 @@ --wandb_version: 1 -- --_wandb: -- desc: null -- value: -- cli_version: 0.10.31 -- code_path: code/train_translation.py -- framework: huggingface -- huggingface_version: 4.14.1 -- is_jupyter_run: false -- is_kaggle_kernel: false -- python_version: 3.7.11 -- t: -- 1: -- - 1 -- - 11 -- 4: 3.7.11 -- 5: 0.10.31 -- 6: 4.14.1 -- 8: -- - 8 --batch_size: -- desc: null -- value: 32 --betas: -- desc: null -- value: -- - 0.9 -- - 0.98 --checkbleu: -- desc: null -- value: 5 --checkpoint_dir: -- desc: null -- value: checkpoint --clip: -- desc: null -- value: 1 --dfeedforward: -- desc: null -- value: 1024 --dist_url: -- desc: null -- value: tcp://localhost:58472 --dmodel: -- desc: null -- value: 768 --dropout: -- desc: null -- value: 0.01 --epochs: -- desc: null -- value: 32 --eps: -- desc: null -- value: 1.0e-09 --learning_rate: -- desc: null -- value: 0.2 --load: -- desc: null -- value: 0 --loss_fn: -- desc: null -- value: cross_entropy --mbert_out_size: -- desc: null -- value: 768 --ngpus_per_node: -- desc: null -- value: 2 --nhead: -- desc: null -- value: 2 --nlayers: -- desc: null -- value: 4 --print_freq: -- desc: null -- value: 5 --projector: -- desc: null -- value: 768-256 --rank: -- desc: null -- value: 0 --test_translation: -- desc: null -- value: 0 --tokenizer: -- desc: null -- value: bert-base-multilingual-uncased --train: -- desc: null -- value: true --weight_decay: -- desc: null -- value: 1.0e-06 --workers: -- desc: null -- value: 4 --world_size: -- desc: null -- value: 2 -diff --git a/wandb/run-20220409_182749-paufev36/files/diff.patch b/wandb/run-20220409_182749-paufev36/files/diff.patch -deleted file mode 100644 -index 17f6c34..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/diff.patch -+++ /dev/null -@@ -1,694 +0,0 @@ --diff --git a/__pycache__/barlow.cpython-37.pyc b/__pycache__/barlow.cpython-37.pyc --index d9b3757..420c21a 100644 --Binary files a/__pycache__/barlow.cpython-37.pyc and b/__pycache__/barlow.cpython-37.pyc differ --diff --git a/__pycache__/train_translation.cpython-37.pyc b/__pycache__/train_translation.cpython-37.pyc --index 7bf3ea7..b5b1fb5 100644 --Binary files a/__pycache__/train_translation.cpython-37.pyc and b/__pycache__/train_translation.cpython-37.pyc differ --diff --git a/barlow.py b/barlow.py --index 99b0da9..b20d671 100644 ----- a/barlow.py --+++ b/barlow.py --@@ -265,13 +265,6 @@ def main_worker(gpu, args): -- optimizer=optimizer.state_dict()) -- torch.save(state, args.checkpoint_dir / 'barlow_checkpoint.pth') -- print('barlow model saved in', args.checkpoint_dir) --- for sent in test_loader: --- y1 = sent[0].cuda(gpu, non_blocking=True) --- y2 = sent[1].cuda(gpu, non_blocking=True) --- model.eval() --- c, _ = model(y1, y2) --- xlabels = tokenizer.convert_ids_to_tokens(y2) --- ylabels = tokenizer.convert_ids_to_tokens(y1) -- # wandb.finish() -- # if args.rank == 0: -- # save final model --diff --git a/checkpoint/stats.txt b/checkpoint/stats.txt --index 97f9eb6..e8bd4e3 100644 ----- a/checkpoint/stats.txt --+++ b/checkpoint/stats.txt --@@ -467,3 +467,362 @@ train_translation.py -- {"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} -- {"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} -- {"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 133} --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 6} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 7} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 7} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 8} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 8} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 9} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 9} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 60} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 61} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 61} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 62} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 63} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 63} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 64} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 64} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 65} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 85} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 85} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 86} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 86} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 87} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 88} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 88} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 89} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 89} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 106} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 106} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 107} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 108} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 108} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 109} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 109} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 110} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 110} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 129} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 130} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 130} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 131} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 131} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 132} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 132} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 133} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 134} --+/home/ivlabs/context_enhancement/context_enhancement/barlow.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 1 --+/home/ivlabs/context_enhancement/context_enhancement/train_translation.py --load 0 --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=28 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120170593261719, "time": 5} --+{"epoch": 0, "step": 5, "loss": 151.9119415283203, "time": 8} --+{"epoch": 1, "step": 10, "loss": 112.8124008178711, "time": 84} --+{"epoch": 2, "step": 15, "loss": 47.12509536743164, "time": 111} --+{"epoch": 3, "step": 20, "loss": 45.04984664916992, "time": 139} --+{"epoch": 4, "step": 25, "loss": 38.9657096862793, "time": 165} --+{"epoch": 5, "step": 30, "loss": 60.226715087890625, "time": 190} --+{"epoch": 5, "step": 35, "loss": 65.24925231933594, "time": 192} --+{"epoch": 6, "step": 40, "loss": 65.57554626464844, "time": 268} --+{"epoch": 7, "step": 45, "loss": 61.62765121459961, "time": 294} --+{"epoch": 8, "step": 50, "loss": 64.9477310180664, "time": 319} --+{"epoch": 9, "step": 55, "loss": 72.8912353515625, "time": 344} --+{"epoch": 10, "step": 60, "loss": 86.97362518310547, "time": 369} --+{"epoch": 10, "step": 65, "loss": 112.7873306274414, "time": 372} --+{"epoch": 11, "step": 70, "loss": 88.19213104248047, "time": 447} --+{"epoch": 12, "step": 75, "loss": 73.24372863769531, "time": 472} --+{"epoch": 13, "step": 80, "loss": 73.8764419555664, "time": 498} --+{"epoch": 14, "step": 85, "loss": 87.44139099121094, "time": 525} --+{"epoch": 15, "step": 90, "loss": 66.60698699951172, "time": 551} --+{"epoch": 15, "step": 95, "loss": 80.11738586425781, "time": 553} --+{"epoch": 16, "step": 100, "loss": 88.93124389648438, "time": 624} --+{"epoch": 17, "step": 105, "loss": 74.59225463867188, "time": 649} --+{"epoch": 18, "step": 110, "loss": 108.9293441772461, "time": 675} --+{"epoch": 19, "step": 115, "loss": 87.63671112060547, "time": 700} --+{"epoch": 20, "step": 120, "loss": 99.23358154296875, "time": 725} --+{"epoch": 20, "step": 125, "loss": 118.16622924804688, "time": 727} --+{"epoch": 21, "step": 130, "loss": 102.9515380859375, "time": 801} --+{"epoch": 22, "step": 135, "loss": 80.40345764160156, "time": 827} --+{"epoch": 23, "step": 140, "loss": 87.99221801757812, "time": 852} --+{"epoch": 24, "step": 145, "loss": 63.2794303894043, "time": 876} --+{"epoch": 25, "step": 150, "loss": 78.17864227294922, "time": 902} --+{"epoch": 25, "step": 155, "loss": 100.8608169555664, "time": 904} --+{"epoch": 26, "step": 160, "loss": 88.68865203857422, "time": 976} --+{"epoch": 27, "step": 165, "loss": 84.6174087524414, "time": 1002} --+train_translation.py --batch_size=256 --dfeedforward=512 --epochs=32 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.139744758605957, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=36 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 6} --+train_translation.py --batch_size=16 --dfeedforward=1024 --epochs=32 --nhead=6 --nlayers=2 --+{"epoch": 0, "step": 0, "loss": 7.180241584777832, "time": 5} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=20 --nhead=8 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.120020389556885, "time": 6} --+train_translation.py --batch_size=64 --dfeedforward=512 --epochs=32 --nhead=2 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.082856178283691, "time": 6} --+train_translation.py --batch_size=128 --dfeedforward=512 --epochs=16 --nhead=6 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140233993530273, "time": 6} --+train_translation.py --batch_size=256 --dfeedforward=256 --epochs=40 --nhead=6 --nlayers=2 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 4} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 4} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 5} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 5} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 6} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 6} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 54} --+{"epoch": 1, "step": 50, "loss": 83.65938568115234, "time": 54} --+{"epoch": 1, "step": 55, "loss": 121.91883850097656, "time": 54} --+{"epoch": 1, "step": 60, "loss": 69.18376159667969, "time": 55} --+{"epoch": 1, "step": 65, "loss": 108.74915313720703, "time": 55} --+{"epoch": 1, "step": 70, "loss": 72.30828094482422, "time": 55} --+{"epoch": 1, "step": 75, "loss": 71.81124114990234, "time": 56} --+{"epoch": 1, "step": 80, "loss": 68.34536743164062, "time": 56} --+{"epoch": 1, "step": 85, "loss": 60.85449981689453, "time": 56} --+{"epoch": 2, "step": 90, "loss": 77.74386596679688, "time": 77} --+{"epoch": 2, "step": 95, "loss": 96.72307586669922, "time": 77} --+{"epoch": 2, "step": 100, "loss": 101.80294036865234, "time": 77} --+{"epoch": 2, "step": 105, "loss": 84.51009368896484, "time": 78} --+{"epoch": 2, "step": 110, "loss": 72.72525787353516, "time": 78} --+{"epoch": 2, "step": 115, "loss": 74.45042419433594, "time": 78} --+{"epoch": 2, "step": 120, "loss": 67.41654968261719, "time": 79} --+{"epoch": 2, "step": 125, "loss": 78.1681137084961, "time": 79} --+{"epoch": 2, "step": 130, "loss": 92.35138702392578, "time": 79} --+{"epoch": 3, "step": 135, "loss": 67.62174224853516, "time": 97} --+{"epoch": 3, "step": 140, "loss": 73.0427017211914, "time": 97} --+{"epoch": 3, "step": 145, "loss": 105.50846099853516, "time": 98} --+{"epoch": 3, "step": 150, "loss": 80.58209991455078, "time": 98} --+{"epoch": 3, "step": 155, "loss": 93.44019317626953, "time": 98} --+{"epoch": 3, "step": 160, "loss": 89.55480194091797, "time": 99} --+{"epoch": 3, "step": 165, "loss": 105.64498138427734, "time": 99} --+{"epoch": 3, "step": 170, "loss": 114.21644592285156, "time": 99} --+{"epoch": 3, "step": 175, "loss": 132.64865112304688, "time": 100} --+{"epoch": 4, "step": 180, "loss": 123.47101593017578, "time": 116} --+{"epoch": 4, "step": 185, "loss": 98.48711395263672, "time": 117} --+{"epoch": 4, "step": 190, "loss": 106.57389831542969, "time": 117} --+{"epoch": 4, "step": 195, "loss": 123.41980743408203, "time": 118} --+{"epoch": 4, "step": 200, "loss": 133.0455322265625, "time": 118} --+{"epoch": 4, "step": 205, "loss": 115.12477111816406, "time": 118} --+{"epoch": 4, "step": 210, "loss": 173.08377075195312, "time": 119} --+{"epoch": 4, "step": 215, "loss": 95.62724304199219, "time": 119} --+{"epoch": 4, "step": 220, "loss": 146.6149444580078, "time": 119} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 5} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 7} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+train_translation.py --load 0 --+{"epoch": 0, "step": 0, "loss": 7.142178058624268, "time": 4} --+{"epoch": 0, "step": 5, "loss": 69.92982482910156, "time": 5} --+{"epoch": 0, "step": 10, "loss": 187.95425415039062, "time": 5} --+{"epoch": 0, "step": 15, "loss": 116.46453094482422, "time": 5} --+{"epoch": 0, "step": 20, "loss": 103.49996948242188, "time": 6} --+{"epoch": 0, "step": 25, "loss": 109.99765014648438, "time": 6} --+{"epoch": 0, "step": 30, "loss": 82.7474365234375, "time": 6} --+{"epoch": 0, "step": 35, "loss": 81.3102798461914, "time": 7} --+{"epoch": 0, "step": 40, "loss": 68.49085235595703, "time": 7} --+{"epoch": 1, "step": 45, "loss": 83.40009307861328, "time": 55} --+{"epoch": 1, "step": 50, "loss": 83.36439514160156, "time": 55} --+{"epoch": 1, "step": 55, "loss": 117.81816101074219, "time": 56} --+{"epoch": 1, "step": 60, "loss": 70.09979248046875, "time": 56} --+{"epoch": 1, "step": 65, "loss": 90.87323760986328, "time": 57} --+{"epoch": 1, "step": 70, "loss": 60.27517318725586, "time": 57} --+{"epoch": 1, "step": 75, "loss": 99.74661254882812, "time": 57} --+{"epoch": 1, "step": 80, "loss": 76.57121276855469, "time": 58} --+{"epoch": 1, "step": 85, "loss": 85.32162475585938, "time": 58} --+{"epoch": 2, "step": 90, "loss": 79.57125091552734, "time": 104} --+{"epoch": 2, "step": 95, "loss": 145.4536590576172, "time": 104} --+{"epoch": 2, "step": 100, "loss": 72.27653503417969, "time": 105} --+{"epoch": 2, "step": 105, "loss": 90.55571746826172, "time": 105} --+{"epoch": 2, "step": 110, "loss": 83.55565643310547, "time": 105} --+{"epoch": 2, "step": 115, "loss": 61.579551696777344, "time": 106} --+{"epoch": 2, "step": 120, "loss": 98.33128356933594, "time": 107} --+{"epoch": 2, "step": 125, "loss": 128.28770446777344, "time": 107} --+{"epoch": 2, "step": 130, "loss": 82.06121063232422, "time": 108} --+{"epoch": 3, "step": 135, "loss": 78.25971221923828, "time": 128} --+{"epoch": 3, "step": 140, "loss": 75.09734344482422, "time": 128} --+{"epoch": 3, "step": 145, "loss": 109.36125183105469, "time": 128} --+{"epoch": 3, "step": 150, "loss": 102.68833923339844, "time": 129} --+{"epoch": 3, "step": 155, "loss": 102.20543670654297, "time": 129} --+{"epoch": 3, "step": 160, "loss": 98.07948303222656, "time": 129} --+{"epoch": 3, "step": 165, "loss": 99.76647186279297, "time": 130} --+{"epoch": 3, "step": 170, "loss": 98.70307159423828, "time": 130} --+{"epoch": 3, "step": 175, "loss": 102.44486999511719, "time": 131} --+{"epoch": 4, "step": 180, "loss": 101.29882049560547, "time": 150} --+{"epoch": 4, "step": 185, "loss": 113.0394287109375, "time": 150} --+{"epoch": 4, "step": 190, "loss": 102.2679214477539, "time": 150} --+{"epoch": 4, "step": 195, "loss": 88.9566650390625, "time": 151} --+{"epoch": 4, "step": 200, "loss": 80.84623718261719, "time": 151} --+{"epoch": 4, "step": 205, "loss": 173.88238525390625, "time": 151} --+{"epoch": 4, "step": 210, "loss": 138.01107788085938, "time": 152} --+{"epoch": 4, "step": 215, "loss": 116.2401351928711, "time": 152} --+{"epoch": 4, "step": 220, "loss": 119.53892517089844, "time": 153} --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --load 0 --test_translation 1 --+train_translation.py --+{"epoch": 0, "step": 0, "loss": 7.122797966003418, "time": 4} --+{"epoch": 0, "step": 5, "loss": 198.62460327148438, "time": 4} --+{"epoch": 0, "step": 10, "loss": 119.2225341796875, "time": 4} --+{"epoch": 0, "step": 15, "loss": 89.00521850585938, "time": 4} --+{"epoch": 0, "step": 20, "loss": 144.1955108642578, "time": 5} --+{"epoch": 0, "step": 25, "loss": 126.2806396484375, "time": 5} --+{"epoch": 0, "step": 30, "loss": 115.61041259765625, "time": 5} --+{"epoch": 0, "step": 35, "loss": 84.10115814208984, "time": 5} --+{"epoch": 0, "step": 40, "loss": 65.00213623046875, "time": 5} --+{"epoch": 1, "step": 45, "loss": 79.53411865234375, "time": 75} --+{"epoch": 1, "step": 50, "loss": 81.8320541381836, "time": 75} --+{"epoch": 1, "step": 55, "loss": 97.07718658447266, "time": 75} --+{"epoch": 1, "step": 60, "loss": 77.87088012695312, "time": 75} --+{"epoch": 1, "step": 65, "loss": 91.45843505859375, "time": 75} --+{"epoch": 1, "step": 70, "loss": 81.77067565917969, "time": 76} --+{"epoch": 1, "step": 75, "loss": 93.20482635498047, "time": 76} --+{"epoch": 1, "step": 80, "loss": 96.80836486816406, "time": 76} --+{"epoch": 1, "step": 85, "loss": 99.4000473022461, "time": 76} --+{"epoch": 2, "step": 90, "loss": 84.4419174194336, "time": 95} --+{"epoch": 2, "step": 95, "loss": 89.35089111328125, "time": 95} --+{"epoch": 2, "step": 100, "loss": 70.36296081542969, "time": 96} --+{"epoch": 2, "step": 105, "loss": 93.40479278564453, "time": 96} --+{"epoch": 2, "step": 110, "loss": 85.92987823486328, "time": 96} --+{"epoch": 2, "step": 115, "loss": 84.89830780029297, "time": 96} --+{"epoch": 2, "step": 120, "loss": 88.87590789794922, "time": 96} --+{"epoch": 2, "step": 125, "loss": 89.31674194335938, "time": 96} --+{"epoch": 2, "step": 130, "loss": 114.93965911865234, "time": 97} --+{"epoch": 3, "step": 135, "loss": 76.80366516113281, "time": 115} --+{"epoch": 3, "step": 140, "loss": 140.8549346923828, "time": 115} --+{"epoch": 3, "step": 145, "loss": 113.339111328125, "time": 116} --+{"epoch": 3, "step": 150, "loss": 93.06966400146484, "time": 116} --+{"epoch": 3, "step": 155, "loss": 113.3215103149414, "time": 116} --+{"epoch": 3, "step": 160, "loss": 109.3653335571289, "time": 116} --+{"epoch": 3, "step": 165, "loss": 139.5435333251953, "time": 116} --+{"epoch": 3, "step": 170, "loss": 76.41168975830078, "time": 117} --+{"epoch": 3, "step": 175, "loss": 132.55953979492188, "time": 117} --+{"epoch": 4, "step": 180, "loss": 109.78890228271484, "time": 143} --+{"epoch": 4, "step": 185, "loss": 88.3539810180664, "time": 143} --+{"epoch": 4, "step": 190, "loss": 113.5445327758789, "time": 144} --+{"epoch": 4, "step": 195, "loss": 107.1954345703125, "time": 144} --+{"epoch": 4, "step": 200, "loss": 127.9149398803711, "time": 144} --+{"epoch": 4, "step": 205, "loss": 131.3365936279297, "time": 144} --+{"epoch": 4, "step": 210, "loss": 129.23558044433594, "time": 145} --+{"epoch": 4, "step": 215, "loss": 86.24095153808594, "time": 145} --+{"epoch": 4, "step": 220, "loss": 143.04344177246094, "time": 145} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 679.4036254882812, "time": 10} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+barlow.py --+{"epoch": 0, "step": 0, "lr_weights": 0.0, "lr_biases": 0.0, "loss": 456.90240478515625, "time": 8} --+train_translation.py --batch_size=128 --dfeedforward=1024 --epochs=24 --nhead=4 --nlayers=4 --+{"epoch": 0, "step": 0, "loss": 7.140841484069824, "time": 5} --+{"epoch": 2, "step": 5, "loss": 253.87469482421875, "time": 74} --+{"epoch": 5, "step": 10, "loss": 150.13229370117188, "time": 139} --+{"epoch": 7, "step": 15, "loss": 106.13131713867188, "time": 216} --+{"epoch": 10, "step": 20, "loss": 77.7083511352539, "time": 285} --+{"epoch": 12, "step": 25, "loss": 74.31400299072266, "time": 365} --+{"epoch": 15, "step": 30, "loss": 74.50468444824219, "time": 432} --+{"epoch": 17, "step": 35, "loss": 62.94711685180664, "time": 515} --+{"epoch": 20, "step": 40, "loss": 59.828826904296875, "time": 583} --+{"epoch": 22, "step": 45, "loss": 62.49226379394531, "time": 663} --+train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=40 --nhead=4 --nlayers=6 --+{"epoch": 0, "step": 0, "loss": 7.117185592651367, "time": 5} --+{"epoch": 0, "step": 5, "loss": 240.16217041015625, "time": 6} --+{"epoch": 1, "step": 10, "loss": 155.1521453857422, "time": 76} --+{"epoch": 2, "step": 15, "loss": 137.45753479003906, "time": 101} --+{"epoch": 3, "step": 20, "loss": 117.7391357421875, "time": 127} --+{"epoch": 4, "step": 25, "loss": 71.79619598388672, "time": 154} --+{"epoch": 5, "step": 30, "loss": 74.55005645751953, "time": 182} --+{"epoch": 5, "step": 35, "loss": 71.86864471435547, "time": 183} --+{"epoch": 6, "step": 40, "loss": 67.3455810546875, "time": 253} --+{"epoch": 7, "step": 45, "loss": 85.43989562988281, "time": 279} --+{"epoch": 8, "step": 50, "loss": 85.58329772949219, "time": 305} --+{"epoch": 9, "step": 55, "loss": 75.13690948486328, "time": 333} --+{"epoch": 10, "step": 60, "loss": 99.44623565673828, "time": 361} --+{"epoch": 10, "step": 65, "loss": 92.4845962524414, "time": 362} --+{"epoch": 11, "step": 70, "loss": 70.49784851074219, "time": 435} --+{"epoch": 12, "step": 75, "loss": 106.4268569946289, "time": 458} --+{"epoch": 13, "step": 80, "loss": 66.5932388305664, "time": 487} --+{"epoch": 14, "step": 85, "loss": 88.70879364013672, "time": 511} --+{"epoch": 15, "step": 90, "loss": 81.76454162597656, "time": 535} --+{"epoch": 15, "step": 95, "loss": 56.718807220458984, "time": 536} --+{"epoch": 16, "step": 100, "loss": 73.56828308105469, "time": 599} --+{"epoch": 17, "step": 105, "loss": 87.1954116821289, "time": 623} --+{"epoch": 18, "step": 110, "loss": 81.27310180664062, "time": 649} --+{"epoch": 19, "step": 115, "loss": 118.82411193847656, "time": 673} --+{"epoch": 20, "step": 120, "loss": 104.59524536132812, "time": 699} --+{"epoch": 20, "step": 125, "loss": 91.45010375976562, "time": 701} --+{"epoch": 21, "step": 130, "loss": 96.45476531982422, "time": 768} --+{"epoch": 22, "step": 135, "loss": 73.63231658935547, "time": 792} --+{"epoch": 23, "step": 140, "loss": 81.41030883789062, "time": 820} --+{"epoch": 24, "step": 145, "loss": 68.5522232055664, "time": 845} --+{"epoch": 25, "step": 150, "loss": 87.08369445800781, "time": 877} --+{"epoch": 25, "step": 155, "loss": 60.33863830566406, "time": 878} --+{"epoch": 26, "step": 160, "loss": 90.980224609375, "time": 943} --+{"epoch": 27, "step": 165, "loss": 89.83417510986328, "time": 967} --+{"epoch": 28, "step": 170, "loss": 59.04204177856445, "time": 995} --+{"epoch": 29, "step": 175, "loss": 76.57648468017578, "time": 1020} --+{"epoch": 30, "step": 180, "loss": 79.04066467285156, "time": 1047} --+{"epoch": 30, "step": 185, "loss": 116.04915618896484, "time": 1048} --+{"epoch": 31, "step": 190, "loss": 96.91857147216797, "time": 1120} --+{"epoch": 32, "step": 195, "loss": 117.3604965209961, "time": 1142} --+{"epoch": 33, "step": 200, "loss": 79.40359497070312, "time": 1173} --+{"epoch": 34, "step": 205, "loss": 118.38796997070312, "time": 1199} --+{"epoch": 35, "step": 210, "loss": 100.85802459716797, "time": 1227} --+{"epoch": 35, "step": 215, "loss": 127.6283187866211, "time": 1228} --+{"epoch": 36, "step": 220, "loss": 107.0147705078125, "time": 1295} --+{"epoch": 37, "step": 225, "loss": 101.71541595458984, "time": 1319} --+{"epoch": 38, "step": 230, "loss": 109.91344451904297, "time": 1354} --+{"epoch": 39, "step": 235, "loss": 91.43553924560547, "time": 1382} --diff --git a/sweep.yaml b/sweep.yaml --index 6402430..ae76056 100644 ----- a/sweep.yaml --+++ b/sweep.yaml --@@ -1,17 +1,20 @@ ---program: main.py --+program: train_translation.py -- method: bayes -- metric: -- name: epoch_loss -- goal: minimize -- ---description: 'trial2 learning q distributions' --+description: 'translation sweep' -- parameters: -- --- lambd: --+ epochs: -- distribution: 'q_uniform' --- min: 0 --- max: 1 --- q: 0.05 --+ min: 10 --+ max: 40 --+ q: 4 --+ --+ batch_size: --+ values: [16, 32, 64, 128, 256] -- -- nhead: -- distribution: 'q_uniform' --@@ -19,6 +22,9 @@ parameters: -- max: 8 -- q: 2 -- --+ dfeedforward: --+ values: [ 256, 512, 1024] --+ -- nlayers: -- distribution: 'q_uniform' -- min: 2 --@@ -26,6 +32,6 @@ parameters: -- q: 2 -- -- --- --+ # to add: lr, dropout, betas, loss_fn -- -- --diff --git a/test_translation.py b/test_translation.py --index 67aad1e..47a6ecd 100644 ----- a/test_translation.py --+++ b/test_translation.py --@@ -5,13 +5,20 @@ import os -- -- -- # translation pretraining --+# sweep translation --+# wandb sweep_translation.yaml -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') -- -- # context enhancement --+# sweep barlow with translation encoder hyper-params --+# sweep sweep_barlow.yaml -- os.system('python ~/context_enhancement/context_enhancement/barlow.py --load 1') -- -- # tranining translation --+#train translation with translation hyper-params --+#python train_translation.py -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 1') -- -- # testing translation --+# no need -- os.system('python ~/context_enhancement/context_enhancement/train_translation.py --load 0') --diff --git a/train_translation.py b/train_translation.py --index 1b0fe42..f284015 100644 ----- a/train_translation.py --+++ b/train_translation.py --@@ -101,6 +101,8 @@ parser.add_argument('--train', default=True , type=bool, -- parser.add_argument('--print_freq', default=5 , type=int, -- metavar='PF', help='frequency of printing and saving stats') -- --+parser.add_argument('--test_translation', default=0, type=int, --+ metavar='TT', help='testing translation_score') -- ''' NOTE: -- Transformer and tokenizer arguments would remain constant in training and context enhancement step. -- ''' --@@ -143,9 +145,9 @@ def main_worker(gpu, args): -- -- if args.rank == 0: -- ---# wandb.init(config=args, project='translation_test')############################################# ---# wandb.config.update(args) ---# config = wandb.config --+ wandb.init(config=args, project='translation_test')############################################# --+ wandb.config.update(args) --+ config = wandb.config -- -- # exit() -- args.checkpoint_dir.mkdir(parents=True, exist_ok=True) --@@ -236,84 +238,101 @@ def main_worker(gpu, args): -- start_time = time.time() -- -- --- --- for epoch in range(start_epoch, args.epochs): --- sampler.set_epoch(epoch) --- epoch_loss = 0 --- for step, (sent) in enumerate(loader, start=epoch * len(loader)): --- src = sent[0].cuda(gpu, non_blocking=True) --- tgt_inp = sent[2].cuda(gpu, non_blocking=True) --- tgt_out = sent[3].cuda(gpu, non_blocking=True) --- --- src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --- logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --- --- optimizer.zero_grad() --+ if not args.test_translation: -- --- loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --- loss.backward() --+ for epoch in range(start_epoch, args.epochs): --+ sampler.set_epoch(epoch) --+ epoch_loss = 0 --+ for step, (sent) in enumerate(loader, start=epoch * len(loader)): --+ src = sent[0].cuda(gpu, non_blocking=True) --+ tgt_inp = sent[2].cuda(gpu, non_blocking=True) --+ tgt_out = sent[3].cuda(gpu, non_blocking=True) --+ --+ src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_inp, pad_idx) --+ logits = model(src, tgt_inp, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask) --+ --+ optimizer.zero_grad() --+ --+ loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)) --+ loss.backward() -- --- optimizer.step() --- # losses += loss.item() --+ optimizer.step() --+ # losses += loss.item() --+ --+ # wandb.log({'iter_loss': loss}) --+ epoch_loss += loss.item() --+ torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ --+ if step % args.print_freq == 0: --+ if args.rank == 0: --+ stats = dict(epoch=epoch, step=step, --+ loss=loss.item(), --+ time=int(time.time() - start_time)) --+ print(json.dumps(stats)) --+ print(json.dumps(stats), file=stats_file) --+ if args.rank == 0: --+ --+ wandb.log({"epoch_loss":epoch_loss}) --+ # save checkpoint --+ state = dict(epoch=epoch + 1, model=model.module.state_dict(), --+ optimizer=optimizer.state_dict()) --+ # print(model.state_dict) --+ torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --+ print('translation model saved in', args.checkpoint_dir) -- ---# wandb.log({'iter_loss': loss}) --- epoch_loss += loss.item() --- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --+ ############################################################## --+ if args.rank == 0: --+ if epoch%args.checkbleu ==0 : --+ --+ bleu_score = checkbleu(model, tokenizer, test_loader, gpu) --+ wandb.log({'bleu_score': bleu_score}) --+ # print(bleu_score(predicted, target)) --+ ############################################################## --+ # if epoch%1 ==0 : --+ # torch.save(model.module.state_dict(), --+ # 'path.pth') --+ # print("Model is saved") --+ # if args.rank == 0: --+ # # save checkpoint --+ # state = dict(epoch=epoch + 1, model=model.state_dict(), --+ # optimizer=optimizer.state_dict()) --+ # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --+ # print('saved translation model in', args.checkpoint_dir) --+ wandb.finish() -- --- if step % args.print_freq == 0: --- if args.rank == 0: --- stats = dict(epoch=epoch, step=step, --- loss=loss.item(), --- time=int(time.time() - start_time)) --- print(json.dumps(stats)) --- print(json.dumps(stats), file=stats_file) --- # wandb.log({"epoch_loss":epoch_loss}) --- if args.rank == 0: --- # save checkpoint --- state = dict(epoch=epoch + 1, model=model.module.state_dict(), --- optimizer=optimizer.state_dict()) --- # print(model.state_dict) --- torch.save(state, args.checkpoint_dir / 'translation_checkpoint.pth') --- print('translation model saved in', args.checkpoint_dir) --- ---############################################################## --- if epoch%args.checkbleu ==0 : --+ else: --+ --+ bleu_score = checkbleu(model,tokenizer, test_loader, gpu ) --+ print('test_bleu_score', bleu_score) --+ if args.rank == 0: --+ wandb.log({'bleu_score': bleu_score}) --+ -- --- model.eval() --- predicted=[] --- target=[] --+def checkbleu(model, tokenizer, test_loader, gpu): --+ --+ model.eval() --+ predicted=[] --+ target=[] -- --- for i in test_loader: --- src = i[0].cuda(gpu, non_blocking=True) --- tgt_out = i[3].cuda(gpu, non_blocking=True) --- num_tokens = src.shape[0] --- --- src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --- out = translate(model, src, tokenizer, src_mask, gpu) --- predicted.append(out) --- target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) --+ for i in test_loader: --+ src = i[0].cuda(gpu, non_blocking=True) --+ tgt_out = i[3].cuda(gpu, non_blocking=True) --+ num_tokens = src.shape[0] --+ --+ src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).cuda(gpu, non_blocking=True) --+ out = translate(model, src, tokenizer, src_mask, gpu) --+ predicted.append(out) --+ target.append([tokenizer.convert_ids_to_tokens(tgt_out)]) -- --- try: --- bleu_score(predicted, target) --- except: --- predicted.pop() --- target.pop() --+ try: --+ bleu_score(predicted, target) --+ except: --+ predicted.pop() --+ target.pop() -- --- print(bleu_score(predicted, target)) ---############################################################## ---# if epoch%1 ==0 : ---# torch.save(model.module.state_dict(), ---# 'path.pth') ---# print("Model is saved") --- # if args.rank == 0: --- # # save checkpoint --- # state = dict(epoch=epoch + 1, model=model.state_dict(), --- # optimizer=optimizer.state_dict()) --- # torch.save(state, args.checkpoint_dir / f'translation_checkpoint.pth') --- # print('saved translation model in', args.checkpoint_dir) ---# wandb.finish() --- --+ bleu = bleu_score(predicted, target) -- --+ return bleu -- -- ''' -- todo: --@@ -360,3 +379,4 @@ def translate(model: torch.nn.Module, -- -- if __name__ == '__main__': -- main() --+ wandb.finish() --diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log --index b8c8383..6163657 120000 ----- a/wandb/debug-internal.log --+++ b/wandb/debug-internal.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug-internal.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug-internal.log --\ No newline at end of file --diff --git a/wandb/debug.log b/wandb/debug.log --index 1d77d77..7d0f5dd 120000 ----- a/wandb/debug.log --+++ b/wandb/debug.log --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8/logs/debug.log --\ No newline at end of file --+run-20220409_182749-paufev36/logs/debug.log --\ No newline at end of file --diff --git a/wandb/latest-run b/wandb/latest-run --index ad4b017..f11d588 120000 ----- a/wandb/latest-run --+++ b/wandb/latest-run --@@ -1 +1 @@ ---run-20220406_171518-s7zesus8 --\ No newline at end of file --+run-20220409_182749-paufev36 --\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/output.log b/wandb/run-20220409_182749-paufev36/files/output.log -deleted file mode 100644 -index 8a30e30..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/output.log -+++ /dev/null -@@ -1,55 +0,0 @@ -- --train_translation.py --batch_size=32 --dfeedforward=1024 --epochs=32 --nhead=2 --nlayers=4 --wandb: WARNING Config item 'epochs' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'batch_size' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nhead' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'dfeedforward' was locked by 'sweep' (ignored update). --wandb: WARNING Config item 'nlayers' was locked by 'sweep' (ignored update). --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) --Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias'] --- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). --- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). --{"epoch": 0, "step": 0, "loss": 7.115720272064209, "time": 5} --/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py:264: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. -- torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) --{"epoch": 0, "step": 5, "loss": 202.97476196289062, "time": 6} --translation model saved in checkpoint --{"epoch": 1, "step": 10, "loss": 151.204345703125, "time": 62} --translation model saved in checkpoint --{"epoch": 2, "step": 15, "loss": 76.84952545166016, "time": 83} --translation model saved in checkpoint --{"epoch": 3, "step": 20, "loss": 50.71405029296875, "time": 105} --translation model saved in checkpoint --{"epoch": 4, "step": 25, "loss": 38.18907165527344, "time": 127} --Exception in thread Thread-3: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop -- msg = self._response_queue.get(timeout=1) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get -- res = self._recv_bytes() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes -- buf = self._recv_bytes(maxlength) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes -- buf = self._recv(4) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv -- raise EOFError --EOFError --Exception in thread Thread-16: --Traceback (most recent call last): -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner -- self.run() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run -- self._target(*self._args, **self._kwargs) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/wandb_run.py", line 198, in check_status -- status_response = self._interface.communicate_stop_status() -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 743, in communicate_stop_status -- resp = self._communicate(req, timeout=timeout, local=True) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 545, in _communicate -- return self._communicate_async(rec, local=local).get(timeout=timeout) -- File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 550, in _communicate_async -- raise Exception("The wandb backend process has shutdown") -diff --git a/wandb/run-20220409_182749-paufev36/files/requirements.txt b/wandb/run-20220409_182749-paufev36/files/requirements.txt -deleted file mode 100644 -index 59aa056..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/requirements.txt -+++ /dev/null -@@ -1,72 +0,0 @@ --aiohttp==3.8.1 --aiosignal==1.2.0 --async-timeout==4.0.2 --asynctest==0.13.0 --attrs==21.4.0 --blessings==1.7 --brotlipy==0.7.0 --certifi==2021.10.8 --cffi==1.15.0 --charset-normalizer==2.0.12 --click==8.0.4 --configparser==5.2.0 --cryptography==36.0.0 --datasets==1.16.1 --dill==0.3.4 --docker-pycreds==0.4.0 --filelock==3.6.0 --frozenlist==1.3.0 --fsspec==2022.2.0 --gitdb==4.0.9 --gitpython==3.1.27 --gpustat==0.6.0 --huggingface-hub==0.4.0 --idna==3.3 --importlib-metadata==4.11.3 --joblib==1.1.0 --mkl-fft==1.3.1 --mkl-random==1.2.2 --mkl-service==2.4.0 --multidict==6.0.2 --multiprocess==0.70.12.2 --numpy==1.21.5 --nvidia-ml-py3==7.352.0 --packaging==21.3 --pandas==1.3.5 --pathtools==0.1.2 --pillow==9.0.1 --pip==21.2.2 --promise==2.3 --protobuf==3.19.4 --psutil==5.9.0 --pyarrow==7.0.0 --pycparser==2.21 --pyopenssl==22.0.0 --pyparsing==3.0.7 --pysocks==1.7.1 --python-dateutil==2.8.2 --pytz==2022.1 --pyyaml==6.0 --regex==2022.3.15 --requests==2.27.1 --sacremoses==0.0.49 --sentry-sdk==1.5.8 --setuptools==58.0.4 --shortuuid==1.0.8 --six==1.16.0 --smmap==5.0.0 --subprocess32==3.5.4 --tokenizers==0.10.3 --torch==1.11.0 --torchaudio==0.11.0 --torchtext==0.12.0 --torchvision==0.12.0 --tqdm==4.63.1 --transformers==4.14.1 --typing-extensions==4.1.1 --urllib3==1.26.9 --wandb==0.10.31 --wheel==0.37.1 --xxhash==3.0.0 --yarl==1.7.2 --zipp==3.7.0 -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json b/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -deleted file mode 100644 -index ee6c1fa..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json -+++ /dev/null -@@ -1,30 +0,0 @@ --{ -- "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", -- "python": "3.7.11", -- "heartbeatAt": "2022-04-09T12:57:50.039943", -- "startedAt": "2022-04-09T12:57:49.399103", -- "docker": null, -- "gpu": "NVIDIA GeForce GTX 1080 Ti", -- "gpu_count": 2, -- "cpu_count": 8, -- "cuda": null, -- "args": [ -- "--batch_size=32", -- "--dfeedforward=1024", -- "--epochs=32", -- "--nhead=2", -- "--nlayers=4" -- ], -- "state": "running", -- "program": "/home/ivlabs/context_enhancement/context_new/context_enhancement/train_translation.py", -- "codePath": "train_translation.py", -- "git": { -- "remote": "https://github.com/IvLabs/context_enhancement.git", -- "commit": "eed2d749c090a46bca0d3e6791485b1c252d8633" -- }, -- "email": "aneeshashetye@gmail.com", -- "root": "/home/ivlabs/context_enhancement/context_new/context_enhancement", -- "host": "hubble-02", -- "username": "ivlabs", -- "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" --} -diff --git a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json b/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -deleted file mode 100644 -index 6be8521..0000000 ---- a/wandb/run-20220409_182749-paufev36/files/wandb-summary.json -+++ /dev/null -@@ -1 +0,0 @@ --{"epoch_loss": 287.689208984375, "_runtime": 137, "_timestamp": 1649509206, "_step": 5, "bleu_score": 0.0} -\ No newline at end of file -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log b/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -deleted file mode 100644 -index ade12de..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug-internal.log -+++ /dev/null -@@ -1,141 +0,0 @@ --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,431 DEBUG MainThread:25755 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send():179] send: header --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,435 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: check_version --2022-04-09 18:27:49,435 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: check_version --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:49,589 DEBUG SenderThread:25755 [sender.py:send():179] send: run --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:50,037 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: run_start --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():39] meta init --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:__init__():53] meta init done --2022-04-09 18:27:50,039 DEBUG HandlerThread:25755 [meta.py:probe():210] probe --2022-04-09 18:27:50,045 DEBUG HandlerThread:25755 [meta.py:_setup_git():200] setup git --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_setup_git():207] setup git done --2022-04-09 18:27:50,064 DEBUG HandlerThread:25755 [meta.py:_save_code():89] save code --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_code():110] save code done --2022-04-09 18:27:50,073 DEBUG HandlerThread:25755 [meta.py:_save_patches():127] save patches --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_patches():169] save patches done --2022-04-09 18:27:50,128 DEBUG HandlerThread:25755 [meta.py:_save_pip():57] save pip --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_pip():71] save pip done --2022-04-09 18:27:50,129 DEBUG HandlerThread:25755 [meta.py:_save_conda():78] save conda --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:_save_conda():86] save conda done --2022-04-09 18:27:51,517 DEBUG HandlerThread:25755 [meta.py:probe():252] probe done --2022-04-09 18:27:51,519 DEBUG SenderThread:25755 [sender.py:send():179] send: files --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,530 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:27:51,530 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:51,872 DEBUG SenderThread:25755 [sender.py:send():179] send: config --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:04,050 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:06,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:18,996 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,208 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:22,208 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:37,664 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:37,664 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:49,672 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:28:53,002 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:28:53,002 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:28:55,193 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,936 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:00,937 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:08,453 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:08,454 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:20,345 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:22,285 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:23,787 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:23,787 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:39,186 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:39,186 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:29:44,030 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:51,270 DEBUG SenderThread:25755 [sender.py:send():179] send: stats --2022-04-09 18:29:54,873 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:29:54,873 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: history --2022-04-09 18:30:06,522 DEBUG SenderThread:25755 [sender.py:send():179] send: summary --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:10,343 DEBUG HandlerThread:25755 [handler.py:handle_request():124] handle_request: stop_status --2022-04-09 18:30:10,343 DEBUG SenderThread:25755 [sender.py:send_request():193] send_request: stop_status --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/logs/debug.log b/wandb/run-20220409_182749-paufev36/logs/debug.log -deleted file mode 100644 -index 7b0f79c..0000000 ---- a/wandb/run-20220409_182749-paufev36/logs/debug.log -+++ /dev/null -@@ -1,92 +0,0 @@ --2022-04-09 18:27:49,403 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting env: {'entity': 'tera_squid', 'project': 'context_enhancement', 'sweep_id': '1t9pc38r', 'root_dir': '/home/ivlabs/context_enhancement/context_new/context_enhancement', 'run_id': 'paufev36', 'sweep_param_path': '/home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/sweep-1t9pc38r/config-paufev36.yaml', 'start_method': 'thread'} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_setup.py:_flush():69] setting login settings: {} --2022-04-09 18:27:49,404 INFO MainThread:25755 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/logs/debug-internal.log --2022-04-09 18:27:49,405 INFO MainThread:25755 [wandb_init.py:init():369] calling init triggers --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():376] wandb.init called with sweep_config: {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --config: {'workers': 4, 'epochs': 32, 'batch_size': 32, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'nhead': 2, 'dfeedforward': 1024, 'nlayers': 4, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:49,406 INFO MainThread:25755 [wandb_init.py:init():418] starting backend --2022-04-09 18:27:49,427 INFO MainThread:25755 [backend.py:ensure_launched():132] starting backend process... --2022-04-09 18:27:49,429 INFO MainThread:25755 [backend.py:ensure_launched():137] started backend process with pid: 0 --2022-04-09 18:27:49,430 INFO wandb_internal:25755 [internal.py:wandb_internal():91] W&B internal server running at pid: 25755, started at: 2022-04-09 18:27:49.428830 --2022-04-09 18:27:49,431 INFO MainThread:25755 [wandb_init.py:init():423] backend started and connected --2022-04-09 18:27:49,433 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'batch_size': 32, 'dfeedforward': 1024, 'epochs': 32, 'nhead': 2, 'nlayers': 4} --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():465] updated telemetry --2022-04-09 18:27:49,434 INFO MainThread:25755 [wandb_init.py:init():484] communicating current version --2022-04-09 18:27:49,435 INFO WriterThread:25755 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:27:49,585 INFO MainThread:25755 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" -- --2022-04-09 18:27:49,586 INFO MainThread:25755 [wandb_init.py:init():497] communicating run to backend with 30 second timeout --2022-04-09 18:27:50,034 INFO SenderThread:25755 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:27:50,034 INFO SenderThread:25755 [sender.py:_start_run_threads():707] run started: paufev36 with start time 1649509069 --2022-04-09 18:27:50,036 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:27:50,036 INFO MainThread:25755 [wandb_init.py:init():522] starting run threads in backend --2022-04-09 18:27:51,035 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:27:51,036 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now --2022-04-09 18:27:51,519 INFO SenderThread:25755 [sender.py:_save_file():829] saving file code/train_translation.py with policy now --2022-04-09 18:27:51,520 INFO SenderThread:25755 [sender.py:_save_file():829] saving file diff.patch with policy now --2022-04-09 18:27:51,528 INFO MainThread:25755 [wandb_run.py:_console_start():1538] atexit reg --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1449] Wrapping output streams. --2022-04-09 18:27:51,532 INFO MainThread:25755 [wandb_run.py:_redirect():1473] Redirects installed. --2022-04-09 18:27:51,533 INFO MainThread:25755 [wandb_init.py:init():547] run started, returning control to user process --2022-04-09 18:27:51,534 INFO MainThread:25755 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'dmodel': 768, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml --2022-04-09 18:27:52,045 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:52,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json --2022-04-09 18:27:52,686 INFO Thread-14 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3213fqcg-wandb-metadata.json --2022-04-09 18:27:52,691 INFO Thread-15 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/3tltefpg-code/train_translation.py --2022-04-09 18:27:53,694 INFO Thread-18 :25755 [upload_job.py:push():133] Uploaded file /tmp/tmpzveu7e54wandb/g47w6xsn-diff.patch --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:54,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:27:56,046 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:27:58,047 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:04,051 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:28:04,051 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:06,055 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:22,059 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:28:55,194 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:28:56,070 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:00,938 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:01,087 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:02,088 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:18,092 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:22,287 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:23,093 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:24,094 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:40,099 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:29:44,031 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:29:44,131 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:02,136 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:06,523 INFO SenderThread:25755 [sender.py:_save_file():829] saving file wandb-summary.json with policy end --2022-04-09 18:30:07,138 INFO Thread-11 :25755 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:15,029 WARNING wandb_internal:25755 [internal.py:is_dead():367] Internal process exiting, parent pid 25740 disappeared --2022-04-09 18:30:15,030 ERROR wandb_internal:25755 [internal.py:wandb_internal():143] Internal process shutdown. --2022-04-09 18:30:15,350 INFO HandlerThread:25755 [handler.py:finish():638] shutting down handler --2022-04-09 18:30:15,527 INFO WriterThread:25755 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/run-paufev36.wandb --2022-04-09 18:30:15,678 INFO SenderThread:25755 [sender.py:finish():933] shutting down sender --2022-04-09 18:30:15,678 INFO SenderThread:25755 [dir_watcher.py:finish():282] shutting down directory watcher --2022-04-09 18:30:16,139 INFO SenderThread:25755 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt requirements.txt --2022-04-09 18:30:16,140 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-metadata.json wandb-metadata.json --2022-04-09 18:30:16,142 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log output.log --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml conda-environment.yaml --2022-04-09 18:30:16,143 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json wandb-summary.json --2022-04-09 18:30:16,145 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml config.yaml --2022-04-09 18:30:16,150 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/diff.patch diff.patch --2022-04-09 18:30:16,152 INFO SenderThread:25755 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/code/train_translation.py code/train_translation.py --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:finish():176] shutting down file pusher --2022-04-09 18:30:16,152 INFO SenderThread:25755 [file_pusher.py:join():181] waiting for file pusher --2022-04-09 18:30:17,012 INFO Thread-30 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/output.log --2022-04-09 18:30:17,026 INFO Thread-32 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/wandb-summary.json --2022-04-09 18:30:17,131 INFO Thread-33 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/config.yaml --2022-04-09 18:30:17,133 INFO Thread-29 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/requirements.txt --2022-04-09 18:30:17,424 INFO Thread-31 :25755 [upload_job.py:push():133] Uploaded file /home/ivlabs/context_enhancement/context_new/context_enhancement/wandb/run-20220409_182749-paufev36/files/conda-environment.yaml -diff --git a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb b/wandb/run-20220409_182749-paufev36/run-paufev36.wandb -deleted file mode 100644 -index 70babdb..0000000 -Binary files a/wandb/run-20220409_182749-paufev36/run-paufev36.wandb and /dev/null differ -diff --git a/wandb/sweep-1t9pc38r/config-paufev36.yaml b/wandb/sweep-1t9pc38r/config-paufev36.yaml -deleted file mode 100644 -index da3e8b2..0000000 ---- a/wandb/sweep-1t9pc38r/config-paufev36.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 4 -diff --git a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml b/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -deleted file mode 100644 -index d68afea..0000000 ---- a/wandb/sweep-1t9pc38r/config-vjrenr4z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml b/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -deleted file mode 100644 -index cc3235e..0000000 ---- a/wandb/sweep-1t9pc38r/config-z44hpswp.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml b/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -deleted file mode 100644 -index 24fc0f6..0000000 ---- a/wandb/sweep-7o7qjhjd/config-2o0jaujh.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 1024 --epochs: -- value: 24 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml b/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -deleted file mode 100644 -index eeb3936..0000000 ---- a/wandb/sweep-7o7qjhjd/config-hie2vfqk.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml b/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -deleted file mode 100644 -index f88591e..0000000 ---- a/wandb/sweep-7o7qjhjd/config-lfenfbqz.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-abict4v2.yaml b/wandb/sweep-lrpyor0l/config-abict4v2.yaml -deleted file mode 100644 -index 1b97c5e..0000000 ---- a/wandb/sweep-lrpyor0l/config-abict4v2.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 20 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml b/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -deleted file mode 100644 -index 426c8ac..0000000 ---- a/wandb/sweep-lrpyor0l/config-ba0yl54z.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml b/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -deleted file mode 100644 -index caf5f78..0000000 ---- a/wandb/sweep-lrpyor0l/config-d3rkwo1k.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml b/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -deleted file mode 100644 -index 6b7d3c1..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjhaj183.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 4 -diff --git a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml b/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -deleted file mode 100644 -index 8f11b7e..0000000 ---- a/wandb/sweep-lrpyor0l/config-fjlzyv53.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml b/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -deleted file mode 100644 -index d3a2560..0000000 ---- a/wandb/sweep-lrpyor0l/config-orkb33ld.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 32 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml b/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -deleted file mode 100644 -index 403014d..0000000 ---- a/wandb/sweep-q27ijx1y/config-dg43ixc4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 512 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml b/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -deleted file mode 100644 -index d1bf3d8..0000000 ---- a/wandb/sweep-q27ijx1y/config-fwwd5rya.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 40 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml b/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -deleted file mode 100644 -index 258ae0c..0000000 ---- a/wandb/sweep-yoroy32u/config-2dzyn8ls.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 6 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml b/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -deleted file mode 100644 -index dbe827a..0000000 ---- a/wandb/sweep-yoroy32u/config-7a0i8c1o.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --nhead: -- value: 8 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml b/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -deleted file mode 100644 -index 3aeb285..0000000 ---- a/wandb/sweep-yoroy32u/config-7wn11wz9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 512 --epochs: -- value: 40 --nhead: -- value: 4 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml b/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -deleted file mode 100644 -index ccb6734..0000000 ---- a/wandb/sweep-yoroy32u/config-aqxf4pp9.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 32 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yoroy32u/config-gjih072d.yaml b/wandb/sweep-yoroy32u/config-gjih072d.yaml -deleted file mode 100644 -index 73e8e4c..0000000 ---- a/wandb/sweep-yoroy32u/config-gjih072d.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml b/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -deleted file mode 100644 -index 9d822c0..0000000 ---- a/wandb/sweep-yoroy32u/config-poi9dsbs.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml b/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -deleted file mode 100644 -index f0bd5df..0000000 ---- a/wandb/sweep-yoroy32u/config-th5i0wo4.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 64 --dfeedforward: -- value: 256 --epochs: -- value: 36 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yoroy32u/config-uh7twoim.yaml b/wandb/sweep-yoroy32u/config-uh7twoim.yaml -deleted file mode 100644 -index 508d9e2..0000000 ---- a/wandb/sweep-yoroy32u/config-uh7twoim.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 256 --epochs: -- value: 20 --nhead: -- value: 6 --nlayers: -- value: 2 -diff --git a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml b/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -deleted file mode 100644 -index 83311a7..0000000 ---- a/wandb/sweep-yoroy32u/config-zf5ccuzv.yaml -+++ /dev/null -@@ -1,12 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 1024 --epochs: -- value: 16 --nhead: -- value: 2 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml b/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -deleted file mode 100644 -index 4f6dc35..0000000 ---- a/wandb/sweep-yvfclyxy/config-luzuebmc.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 256 --dfeedforward: -- value: 256 --epochs: -- value: 36 --lambd: -- value: 0.4 --nhead: -- value: 4 --nlayers: -- value: 6 -diff --git a/wandb/sweep-yvfclyxy/config-padai7jf.yaml b/wandb/sweep-yvfclyxy/config-padai7jf.yaml -deleted file mode 100644 -index 9b19315..0000000 ---- a/wandb/sweep-yvfclyxy/config-padai7jf.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 16 --dfeedforward: -- value: 1024 --epochs: -- value: 28 --lambd: -- value: 0.55 --nhead: -- value: 8 --nlayers: -- value: 4 -diff --git a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml b/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -deleted file mode 100644 -index 8a8a9b2..0000000 ---- a/wandb/sweep-yvfclyxy/config-r4bjt76k.yaml -+++ /dev/null -@@ -1,14 +0,0 @@ --wandb_version: 1 -- --batch_size: -- value: 128 --dfeedforward: -- value: 256 --epochs: -- value: 24 --lambd: -- value: 0.2 --nhead: -- value: 2 --nlayers: -- value: 4 diff --git a/wandb/run-20220416_014323-1a0lobwa/files/output.log b/wandb/run-20220416_014323-1a0lobwa/files/output.log deleted file mode 100644 index 94424a5..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/output.log +++ /dev/null @@ -1,106 +0,0 @@ - -train_translation.py --load=0 -Reusing dataset opus_rf (/home/ivlabs/.cache/huggingface/datasets/opus_rf/de-en/1.0.0/3725eb23f8df679ddd37d8d65a6bbfcda7732c66edccbc62a3c3b1354c934c9f) -Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias'] -- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). -- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). -{"epoch": 0, "step": 0, "loss": 7.128603458404541, "time": 9} -/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py:275: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_. - torch.nn.utils.clip_grad_norm(model.parameters(), args.clip) -{"epoch": 0, "step": 5, "loss": 156.04449462890625, "time": 39} -{"epoch": 0, "step": 10, "loss": 154.7353515625, "time": 67} -translation model saved in checkpoint -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['the', 'level', 'of', 'employment', 'in', 'this', 'country', 'is', 'high', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['on', 'the', 'th', '##res', '##hold', 'of', 'the', 'nine', '##ties', ',', 'we', 'should', 'con', '##fir', '##m', 'and', 'strength', '##en', 'in', 'the', 'long', 'term', 'what', 'must', 'be', 'the', 'basis', 'for', 'future', 'developments', 'too', ':', 'the', 'economy', ',', 'full', 'employment', 'and', 'welfare', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['sweden', 'is', 'a', 'good', 'country', 'for', 'enterprise', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['proposals', 'will', 'be', 'put', 'forward', 'for', 'increasing', 'competition', 'and', 'keeping', 'down', 'costs', 'in', 'areas', 'where', 'price', 'trends', 'are', 'boost', '##ing', 'inflation', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['later', 'in', 'the', 'electoral', 'period', ',', 'proposals', 'will', 'be', 'put', 'forward', 'for', 'an', 'extensive', 'reform', 'of', 'the', 'tax', 'system', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['2', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['3', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['opportunities', 'for', 'study', ',', 'work', 'and', 'cultural', 'exchange', '##s', 'across', 'national', 'boundaries', 'will', 'be', 'extended', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['sweden', "'", 's', 'economic', 'situation', 'has', 'improved', 'substantial', '##ly', 'in', 'recent', 'years', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['consideration', 'for', 'the', 'environment', 'and', 'the', 'countryside', 'must', 'character', '##ize', 'developments', 'in', 'all', 'fields', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['the', 'policies', 'we', 'pursue', 'in', 'the', 'next', 'three', 'years', 'will', 'leave', 'their', 'mark', 'on', 'developments', 'in', 'this', 'country', 'during', 'the', 'decade', 'to', 'come', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['in', 'this', 'context', ',', 'the', 'requirements', 'of', 'full', 'employment', ',', 'welfare', ',', 'a', 'good', 'working', 'environment', 'and', 'trade', 'union', 'participation', 'will', 'be', 'key', 'issues', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['a', 'reduction', 'of', 'the', 'marginal', 'income', '-', 'tax', 'rate', 'in', '1989', 'by', '3', 'percentage', 'points', 'will', 'be', 'proposed', 'in', 'the', 'first', 'place', 'for', 'full', '-', 'time', 'employees', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['the', 'government', "'", 's', 'agricultural', 'policy', 'aims', 'to', 'promote', 'farming', 'that', 'th', '##rive', '##s', 'without', 'having', 'dama', '##ging', 'effects', 'on', 'the', 'environment', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['staff', 'rec', '##ruit', '##ment', 'within', 'the', 'car', '##ing', 'services', 'must', 'be', 'improved', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['the', 'aim', 'is', 'to', 'sti', '##mula', '##te', 'work', 'and', 'saving', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['preliminary', 'inspection', 'of', 'new', 'chemical', 'substances', 'will', 'be', 'introduced', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['measures', 'will', 'be', 'taken', 'to', 'protect', 'the', 'visual', 'amen', '##ity', 'of', 'the', 'open', 'landscape', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['4', '.', 'security', 'and', 'responsibility', 'will', 'character', '##ize', 'society', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['it', 'will', 'be', 'pursued', 'with', 'firm', '##ness', 'and', 'consiste', '##ncy', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['international', 'co', '-', 'operation', 'within', 'research', 'and', 'development', 'is', 'becoming', 'increasingly', 'important', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['recu', '##rren', '##t', 'training', 'in', 'working', 'life', 'and', 'qualified', 'further', 'education', 'will', 'play', 'an', 'important', 'role', 'in', 'this', 'context', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['during', 'this', 'electoral', 'period', ',', 'legislation', 'on', 'a', 'sixth', 'week', 'of', 'annual', 'holiday', 'with', 'pay', 'will', 'be', 'ena', '##cted', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['industrial', 'products', 'and', 'processes', 'are', 'to', 'be', 'clean', '##er', 'through', 'string', '##ent', 'requirements', 'and', 'rapid', 'adaptation', 'to', 'new', 'technology', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['the', 'government', 'will', 'put', 'forward', 'proposals', 'for', 'developing', 'police', 'work', 'and', 'making', 'it', 'more', 'effective', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['during', 'the', 'coming', 'electoral', 'period', ',', 'sek', '300', 'million', 'will', 'be', 'ear', '##mark', '##ed', 'for', 'the', 'rene', '##wal', 'and', 'development', 'of', 'cultural', 'life', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['the', 'government', 'inte', '##nds', 'to', 'pursue', 'a', 'food', 'policy', 'such', 'that', 'the', 'price', 'trend', 'is', 'check', '##ed', 'and', 'the', 'consumers', 'offered', 'food', 'at', 'reason', '##able', 'prices', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['resources', 'will', 'be', 'set', 'free', 'for', 'the', 'provision', 'of', 'housing', 'by', 'limit', '##ing', 'other', 'construction', 'projects', 'in', 'over', '##hea', '##ted', 'regions', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['sweden', 'has', 'participated', 'in', 'practical', '##ly', 'all', 'the', 'united', 'nations', 'operations', 'of', 'this', 'kind', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['immigrants', "'", 'entry', 'into', 'the', 'labour', 'market', 'should', 'be', 'facilitate', '##d', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['special', 'measures', 'will', 'be', 'applied', 'in', 'regions', 'particularly', 'exposed', ',', 'for', 'example', 'west', 'skane', 'and', 'the', 'sund', '##s', '##vall', '/', 'tim', '##ra', 'area', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['sweden', "'", 's', 'commitment', 'and', 'responsibility', 'does', 'not', 'end', 'at', 'europe', "'", 's', 'borders', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['places', 'in', 'the', 'upper', 'secondary', 'school', 'will', 'be', 'available', 'to', 'all', 'young', 'people', 'under', 'the', 'age', 'of', '20', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['a', 'proposal', 'for', 'a', 'lower', 'legal', 'limit', 'for', 'the', 'offen', '##ce', 'of', 'driving', 'with', 'ability', 'imp', '##aire', '##d', 'by', 'alcohol', 'will', 'be', 'submitted', 'to', 'parliament', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['development', 'assistance', 'shall', 'furthermore', 'promote', 'a', 'sustainable', 'use', 'of', 'natural', 'resources', 'and', 'protection', 'of', 'the', 'environment', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['there', 'is', 'broad', 'political', 'consensus', 'and', 'support', 'for', 'tac', '##kling', 'the', 'environmental', 'problems', 'force', '##fully', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['proposals', 'for', 'pollution', 'charges', 'for', 'other', 'substances', ',', 'among', 'them', 'carbon', 'dio', '##xide', ',', 'will', 'be', 'presented', 'during', 'this', 'term', 'of', 'office', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['a', 'world', 'in', 'deep', 'economic', 'and', 'social', 'im', '##bala', '##nce', 'will', 'never', 'be', 'safe', '.', '[SEP]'] -out ['s', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the'] -predicted ['i', 'should', 'like', 'to', 'welcome', 'the', 'members', 'of', 'the', 'environment', 'party', 'the', 'green', '##s', 'to', 'what', 'i', 'hope', 'will', 'be', 'ins', '##pi', '##ring', 'parliamentary', 'work', '.', '[SEP]'] -Exception in thread Thread-3: -Traceback (most recent call last): - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 926, in _bootstrap_inner - self.run() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/threading.py", line 870, in run - self._target(*self._args, **self._kwargs) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/site-packages/wandb/sdk/interface/interface.py", line 114, in message_loop - msg = self._response_queue.get(timeout=1) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/queues.py", line 108, in get - res = self._recv_bytes() - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes - buf = self._recv_bytes(maxlength) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes - buf = self._recv(4) - File "/home/ivlabs/miniconda3/envs/ectc/lib/python3.7/multiprocessing/connection.py", line 383, in _recv - raise EOFError diff --git a/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt b/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt deleted file mode 100644 index 5ddce70..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt +++ /dev/null @@ -1,107 +0,0 @@ -aiohttp==3.8.1 -aiosignal==1.2.0 -antlr4-python3-runtime==4.8 -async-timeout==4.0.2 -asynctest==0.13.0 -attrs==21.4.0 -backcall==0.2.0 -bitarray==2.4.1 -blessings==1.7 -brotlipy==0.7.0 -certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.0.4 -colorama==0.4.4 -configparser==5.2.0 -cryptography==36.0.0 -cython==0.29.28 -datasets==1.16.1 -debugpy==1.6.0 -decorator==5.1.1 -dill==0.3.4 -docker-pycreds==0.4.0 -entrypoints==0.4 -fairseq==1.0.0a0 -fastbpe==0.1.0 -filelock==3.6.0 -frozenlist==1.3.0 -fsspec==2022.2.0 -gitdb==4.0.9 -gitpython==3.1.27 -gpustat==0.6.0 -huggingface-hub==0.4.0 -hydra-core==1.0.7 -idna==3.3 -importlib-metadata==4.11.3 -importlib-resources==5.6.0 -ipykernel==6.12.1 -ipython==7.32.0 -jedi==0.18.1 -joblib==1.1.0 -jupyter-client==7.2.2 -jupyter-core==4.9.2 -matplotlib-inline==0.1.3 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -mock==4.0.3 -multidict==6.0.2 -multiprocess==0.70.12.2 -nest-asyncio==1.5.5 -numpy==1.21.5 -nvidia-ml-py3==7.352.0 -omegaconf==2.0.6 -packaging==21.3 -pandas==1.3.5 -parso==0.8.3 -pathtools==0.1.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pillow==9.0.1 -pip==21.2.2 -portalocker==2.4.0 -promise==2.3 -prompt-toolkit==3.0.29 -protobuf==3.19.4 -psutil==5.9.0 -ptyprocess==0.7.0 -pyarrow==7.0.0 -pycparser==2.21 -pygments==2.11.2 -pyopenssl==22.0.0 -pyparsing==3.0.7 -pysocks==1.7.1 -python-dateutil==2.8.2 -pytz==2022.1 -pyyaml==6.0 -pyzmq==22.3.0 -regex==2022.3.15 -requests==2.27.1 -sacrebleu==2.0.0 -sacremoses==0.0.49 -sentry-sdk==1.5.8 -setuptools==58.0.4 -shortuuid==1.0.8 -six==1.16.0 -smmap==5.0.0 -subprocess32==3.5.4 -subword-nmt==0.3.8 -tabulate==0.8.9 -tokenizers==0.10.3 -torch==1.11.0 -torchaudio==0.11.0 -torchtext==0.12.0 -torchvision==0.12.0 -tornado==6.1 -tqdm==4.63.1 -traitlets==5.1.1 -transformers==4.14.1 -typing-extensions==4.1.1 -urllib3==1.26.9 -wandb==0.10.31 -wcwidth==0.2.5 -wheel==0.37.1 -xxhash==3.0.0 -yarl==1.7.2 -zipp==3.7.0 \ No newline at end of file diff --git a/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json b/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json deleted file mode 100644 index df71503..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "os": "Linux-5.13.0-39-generic-x86_64-with-debian-bullseye-sid", - "python": "3.7.11", - "heartbeatAt": "2022-04-15T20:13:24.853414", - "startedAt": "2022-04-15T20:13:23.783007", - "docker": null, - "gpu": "NVIDIA GeForce GTX 1080 Ti", - "gpu_count": 2, - "cpu_count": 8, - "cuda": null, - "args": [ - "--load=0" - ], - "state": "running", - "program": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement/train_translation.py", - "codePath": "train_translation.py", - "git": { - "remote": "https://github.com/IvLabs/context_enhancement.git", - "commit": "3f7c03274d50f816db3079adcb4d4125620373b6" - }, - "email": "aneeshashetye@gmail.com", - "root": "/home/ivlabs/context_enhancement/context_new/new/context_enhancement", - "host": "hubble-02", - "username": "ivlabs", - "executable": "/home/ivlabs/miniconda3/envs/ectc/bin/python" -} diff --git a/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json b/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json deleted file mode 100644 index e0c4e63..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json +++ /dev/null @@ -1 +0,0 @@ -{"epoch_loss": 137.94474399089813, "_runtime": 83, "_timestamp": 1650053686, "_step": 0} \ No newline at end of file diff --git a/wandb/run-20220416_014323-1a0lobwa/logs/debug-internal.log b/wandb/run-20220416_014323-1a0lobwa/logs/debug-internal.log deleted file mode 100644 index 1294372..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/logs/debug-internal.log +++ /dev/null @@ -1,117 +0,0 @@ -2022-04-16 01:43:23,789 INFO MainThread:6896 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:43:23,790 INFO wandb_internal:6896 [internal.py:wandb_internal():91] W&B internal server running at pid: 6896, started at: 2022-04-16 01:43:23.789717 -2022-04-16 01:43:23,791 INFO MainThread:6896 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:43:23,791 DEBUG MainThread:6896 [config_util.py:dict_from_config_file():101] no default config file found in config-defaults.yaml -2022-04-16 01:43:23,792 INFO MainThread:6896 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:43:23,793 INFO MainThread:6896 [wandb_init.py:init():484] communicating current version -2022-04-16 01:43:23,795 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: check_version -2022-04-16 01:43:23,793 INFO WriterThread:6896 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb -2022-04-16 01:43:23,796 DEBUG SenderThread:6896 [sender.py:send():179] send: header -2022-04-16 01:43:23,796 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: check_version -2022-04-16 01:43:24,121 INFO MainThread:6896 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:43:24,121 INFO MainThread:6896 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:43:24,122 DEBUG SenderThread:6896 [sender.py:send():179] send: run -2022-04-16 01:43:24,850 INFO MainThread:6896 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:43:24,850 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: run_start -2022-04-16 01:43:24,851 INFO SenderThread:6896 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files -2022-04-16 01:43:24,851 INFO SenderThread:6896 [sender.py:_start_run_threads():707] run started: 1a0lobwa with start time 1650053603 -2022-04-16 01:43:24,851 DEBUG SenderThread:6896 [sender.py:send():179] send: summary -2022-04-16 01:43:24,851 INFO SenderThread:6896 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:43:24,853 DEBUG HandlerThread:6896 [meta.py:__init__():39] meta init -2022-04-16 01:43:24,853 DEBUG HandlerThread:6896 [meta.py:__init__():53] meta init done -2022-04-16 01:43:24,853 DEBUG HandlerThread:6896 [meta.py:probe():210] probe -2022-04-16 01:43:24,859 DEBUG HandlerThread:6896 [meta.py:_setup_git():200] setup git -2022-04-16 01:43:24,876 DEBUG HandlerThread:6896 [meta.py:_setup_git():207] setup git done -2022-04-16 01:43:24,876 DEBUG HandlerThread:6896 [meta.py:_save_code():89] save code -2022-04-16 01:43:24,886 DEBUG HandlerThread:6896 [meta.py:_save_code():110] save code done -2022-04-16 01:43:24,886 DEBUG HandlerThread:6896 [meta.py:_save_patches():127] save patches -2022-04-16 01:43:24,961 DEBUG HandlerThread:6896 [meta.py:_save_patches():169] save patches done -2022-04-16 01:43:24,961 DEBUG HandlerThread:6896 [meta.py:_save_pip():57] save pip -2022-04-16 01:43:24,961 DEBUG HandlerThread:6896 [meta.py:_save_pip():71] save pip done -2022-04-16 01:43:24,961 DEBUG HandlerThread:6896 [meta.py:_save_conda():78] save conda -2022-04-16 01:43:25,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/diff.patch -2022-04-16 01:43:25,855 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/code -2022-04-16 01:43:26,705 DEBUG HandlerThread:6896 [meta.py:_save_conda():86] save conda done -2022-04-16 01:43:26,705 DEBUG HandlerThread:6896 [meta.py:probe():252] probe done -2022-04-16 01:43:26,708 DEBUG SenderThread:6896 [sender.py:send():179] send: files -2022-04-16 01:43:26,708 INFO SenderThread:6896 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:43:26,709 INFO SenderThread:6896 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:43:26,710 INFO SenderThread:6896 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:43:26,718 INFO MainThread:6896 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:43:26,719 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:43:26,719 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:43:26,719 INFO MainThread:6896 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:43:26,721 INFO MainThread:6896 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:43:26,721 INFO MainThread:6896 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:43:26,721 INFO MainThread:6896 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:43:26,722 INFO MainThread:6896 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:43:26,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml -2022-04-16 01:43:26,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json -2022-04-16 01:43:26,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:27,375 DEBUG SenderThread:6896 [sender.py:send():179] send: config -2022-04-16 01:43:28,355 INFO Thread-15 :6896 [upload_job.py:push():133] Uploaded file /tmp/tmpihia6f2xwandb/2u1coito-code/train_translation.py -2022-04-16 01:43:28,852 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:28,939 INFO Thread-14 :6896 [upload_job.py:push():133] Uploaded file /tmp/tmpihia6f2xwandb/2kqba8ii-wandb-metadata.json -2022-04-16 01:43:29,213 INFO Thread-22 :6896 [upload_job.py:push():133] Uploaded file /tmp/tmpihia6f2xwandb/26d72ylc-diff.patch -2022-04-16 01:43:29,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/config.yaml -2022-04-16 01:43:30,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:32,881 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:42,376 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:43:42,376 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:43:44,886 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:53,405 DEBUG SenderThread:6896 [sender.py:send():179] send: stats -2022-04-16 01:43:58,051 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:43:58,052 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:44:12,895 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:44:13,751 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:44:13,751 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:44:23,825 DEBUG SenderThread:6896 [sender.py:send():179] send: stats -2022-04-16 01:44:29,521 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:44:29,521 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:44:42,905 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:44:45,209 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:44:45,210 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:44:46,692 DEBUG SenderThread:6896 [sender.py:send():179] send: history -2022-04-16 01:44:46,692 DEBUG SenderThread:6896 [sender.py:send():179] send: summary -2022-04-16 01:44:46,692 INFO SenderThread:6896 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:44:46,909 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json -2022-04-16 01:44:54,512 DEBUG SenderThread:6896 [sender.py:send():179] send: stats -2022-04-16 01:45:00,912 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:45:00,912 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:45:00,917 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:04,918 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:06,919 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:08,920 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:10,921 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:12,921 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:14,922 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:16,688 DEBUG HandlerThread:6896 [handler.py:handle_request():124] handle_request: stop_status -2022-04-16 01:45:16,688 DEBUG SenderThread:6896 [sender.py:send_request():193] send_request: stop_status -2022-04-16 01:45:16,926 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:18,927 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:20,928 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:22,928 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:24,929 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:25,143 DEBUG SenderThread:6896 [sender.py:send():179] send: stats -2022-04-16 01:45:26,144 INFO SenderThread:6896 [sender.py:finish():933] shutting down sender -2022-04-16 01:45:26,144 INFO WriterThread:6896 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb -2022-04-16 01:45:26,144 INFO SenderThread:6896 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-16 01:45:26,891 INFO MainThread:6896 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-16 01:45:26,892 INFO MainThread:6896 [wandb_run.py:_restore():1480] restore -2022-04-16 01:45:26,930 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:26,930 INFO SenderThread:6896 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files -2022-04-16 01:45:26,931 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt requirements.txt -2022-04-16 01:45:26,931 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json wandb-metadata.json -2022-04-16 01:45:26,931 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log output.log -2022-04-16 01:45:26,934 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml conda-environment.yaml -2022-04-16 01:45:26,938 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json wandb-summary.json -2022-04-16 01:45:26,941 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/config.yaml config.yaml -2022-04-16 01:45:26,941 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/diff.patch diff.patch -2022-04-16 01:45:26,949 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py code/train_translation.py -2022-04-16 01:45:26,949 INFO SenderThread:6896 [file_pusher.py:finish():176] shutting down file pusher -2022-04-16 01:45:26,950 INFO SenderThread:6896 [file_pusher.py:join():181] waiting for file pusher diff --git a/wandb/run-20220416_014323-1a0lobwa/logs/debug.log b/wandb/run-20220416_014323-1a0lobwa/logs/debug.log deleted file mode 100644 index 4a5d442..0000000 --- a/wandb/run-20220416_014323-1a0lobwa/logs/debug.log +++ /dev/null @@ -1,81 +0,0 @@ -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_setup.py:_flush():69] setting env: {'start_method': 'thread'} -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_setup.py:_flush():69] setting login settings: {} -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_init.py:_log_setup():336] Logging user logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/logs/debug.log -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_init.py:_log_setup():337] Logging internal logs to /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/logs/debug-internal.log -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_init.py:init():369] calling init triggers -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_init.py:init():376] wandb.init called with sweep_config: {} -config: {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': (0.9, 0.98), 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': PosixPath('checkpoint'), 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:43:23,784 INFO MainThread:6896 [wandb_init.py:init():418] starting backend -2022-04-16 01:43:23,789 INFO MainThread:6896 [backend.py:ensure_launched():132] starting backend process... -2022-04-16 01:43:23,789 INFO MainThread:6896 [backend.py:ensure_launched():137] started backend process with pid: 0 -2022-04-16 01:43:23,790 INFO wandb_internal:6896 [internal.py:wandb_internal():91] W&B internal server running at pid: 6896, started at: 2022-04-16 01:43:23.789717 -2022-04-16 01:43:23,791 INFO MainThread:6896 [wandb_init.py:init():423] backend started and connected -2022-04-16 01:43:23,792 INFO MainThread:6896 [wandb_init.py:init():465] updated telemetry -2022-04-16 01:43:23,793 INFO MainThread:6896 [wandb_init.py:init():484] communicating current version -2022-04-16 01:43:23,793 INFO WriterThread:6896 [datastore.py:open_for_write():77] open: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb -2022-04-16 01:43:24,121 INFO MainThread:6896 [wandb_init.py:init():489] got version response upgrade_message: "wandb version 0.12.14 is available! To upgrade, please run:\n $ pip install wandb --upgrade" - -2022-04-16 01:43:24,121 INFO MainThread:6896 [wandb_init.py:init():497] communicating run to backend with 30 second timeout -2022-04-16 01:43:24,850 INFO MainThread:6896 [wandb_init.py:init():522] starting run threads in backend -2022-04-16 01:43:24,851 INFO SenderThread:6896 [dir_watcher.py:__init__():168] watching files in: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files -2022-04-16 01:43:24,851 INFO SenderThread:6896 [sender.py:_start_run_threads():707] run started: 1a0lobwa with start time 1650053603 -2022-04-16 01:43:24,851 INFO SenderThread:6896 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:43:25,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json -2022-04-16 01:43:25,854 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/diff.patch -2022-04-16 01:43:25,855 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/code -2022-04-16 01:43:26,708 INFO SenderThread:6896 [sender.py:_save_file():829] saving file wandb-metadata.json with policy now -2022-04-16 01:43:26,709 INFO SenderThread:6896 [sender.py:_save_file():829] saving file code/train_translation.py with policy now -2022-04-16 01:43:26,710 INFO SenderThread:6896 [sender.py:_save_file():829] saving file diff.patch with policy now -2022-04-16 01:43:26,718 INFO MainThread:6896 [wandb_run.py:_console_start():1538] atexit reg -2022-04-16 01:43:26,719 INFO MainThread:6896 [wandb_run.py:_redirect():1412] redirect: SettingsConsole.WRAP -2022-04-16 01:43:26,721 INFO MainThread:6896 [wandb_run.py:_redirect():1449] Wrapping output streams. -2022-04-16 01:43:26,721 INFO MainThread:6896 [wandb_run.py:_redirect():1473] Redirects installed. -2022-04-16 01:43:26,721 INFO MainThread:6896 [wandb_init.py:init():547] run started, returning control to user process -2022-04-16 01:43:26,722 INFO MainThread:6896 [wandb_run.py:_config_callback():787] config_cb None None {'workers': 4, 'epochs': 10, 'batch_size': 16, 'learning_rate': 0.2, 'dropout': 0.01, 'weight_decay': 1e-06, 'momentum': 0.9, 'clip': 1, 'betas': [0.9, 0.98], 'eps': 1e-09, 'loss_fn': 'cross_entropy', 'optimizer': 'adam', 'dmodel': 768, 'nhead': 4, 'dfeedforward': 200, 'nlayers': 3, 'projector': '768-256', 'tokenizer': 'bert-base-multilingual-uncased', 'mbert_out_size': 768, 'checkpoint_dir': 'checkpoint', 'load': 0, 'checkbleu': 5, 'train': True, 'print_freq': 5, 'test_translation': 0, 'ngpus_per_node': 2, 'rank': 0, 'dist_url': 'tcp://localhost:58472', 'world_size': 2} -2022-04-16 01:43:26,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml -2022-04-16 01:43:26,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json -2022-04-16 01:43:26,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_created():216] file/dir created: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:28,355 INFO Thread-15 :6896 [upload_job.py:push():133] Uploaded file /tmp/tmpihia6f2xwandb/2u1coito-code/train_translation.py -2022-04-16 01:43:28,852 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:28,939 INFO Thread-14 :6896 [upload_job.py:push():133] Uploaded file /tmp/tmpihia6f2xwandb/2kqba8ii-wandb-metadata.json -2022-04-16 01:43:29,213 INFO Thread-22 :6896 [upload_job.py:push():133] Uploaded file /tmp/tmpihia6f2xwandb/26d72ylc-diff.patch -2022-04-16 01:43:29,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/config.yaml -2022-04-16 01:43:30,853 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:32,881 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:43:44,886 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:44:12,895 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:44:42,905 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:44:46,692 INFO SenderThread:6896 [sender.py:_save_file():829] saving file wandb-summary.json with policy end -2022-04-16 01:44:46,909 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json -2022-04-16 01:45:00,917 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:04,918 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:06,919 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:08,920 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:10,921 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:12,921 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:14,922 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:16,926 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:18,927 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:20,928 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:22,928 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:24,929 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:26,144 INFO SenderThread:6896 [sender.py:finish():933] shutting down sender -2022-04-16 01:45:26,144 INFO WriterThread:6896 [datastore.py:close():258] close: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb -2022-04-16 01:45:26,144 INFO SenderThread:6896 [dir_watcher.py:finish():282] shutting down directory watcher -2022-04-16 01:45:26,891 INFO MainThread:6896 [wandb_run.py:_atexit_cleanup():1508] got exitcode: 255 -2022-04-16 01:45:26,892 INFO MainThread:6896 [wandb_run.py:_restore():1480] restore -2022-04-16 01:45:26,930 INFO Thread-11 :6896 [dir_watcher.py:_on_file_modified():229] file/dir modified: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log -2022-04-16 01:45:26,930 INFO SenderThread:6896 [dir_watcher.py:finish():312] scan: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files -2022-04-16 01:45:26,931 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/requirements.txt requirements.txt -2022-04-16 01:45:26,931 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-metadata.json wandb-metadata.json -2022-04-16 01:45:26,931 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/output.log output.log -2022-04-16 01:45:26,934 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/conda-environment.yaml conda-environment.yaml -2022-04-16 01:45:26,938 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/wandb-summary.json wandb-summary.json -2022-04-16 01:45:26,941 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/config.yaml config.yaml -2022-04-16 01:45:26,941 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/diff.patch diff.patch -2022-04-16 01:45:26,949 INFO SenderThread:6896 [dir_watcher.py:finish():318] scan save: /home/ivlabs/context_enhancement/context_new/new/context_enhancement/wandb/run-20220416_014323-1a0lobwa/files/code/train_translation.py code/train_translation.py -2022-04-16 01:45:26,949 INFO SenderThread:6896 [file_pusher.py:finish():176] shutting down file pusher -2022-04-16 01:45:26,950 INFO SenderThread:6896 [file_pusher.py:join():181] waiting for file pusher diff --git a/wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb b/wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb deleted file mode 100644 index a79c900..0000000 Binary files a/wandb/run-20220416_014323-1a0lobwa/run-1a0lobwa.wandb and /dev/null differ